Total coverage: 151450 (9%)of 1848466
6 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match FRAG parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_frag.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 fragment match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the id is matched by the range, 0 otherwise */ static inline bool id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { bool r; pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, id, max); r = (id >= min && id <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } static bool frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct frag_hdr _frag; const struct frag_hdr *fh; const struct ip6t_frag *fraginfo = par->matchinfo; unsigned int ptr = 0; int err; err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); if (fh == NULL) { par->hotdrop = true; return false; } pr_debug("INFO %04X ", fh->frag_off); pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7); pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6); pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF)); pr_debug("ID %u %08X\n", ntohl(fh->identification), ntohl(fh->identification)); pr_debug("IPv6 FRAG id %02X ", id_match(fraginfo->ids[0], fraginfo->ids[1], ntohl(fh->identification), !!(fraginfo->invflags & IP6T_FRAG_INV_IDS))); pr_debug("res %02X %02X%04X %02X ", fraginfo->flags & IP6T_FRAG_RES, fh->reserved, ntohs(fh->frag_off) & 0x6, !((fraginfo->flags & IP6T_FRAG_RES) && (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); pr_debug("first %02X %02X %02X ", fraginfo->flags & IP6T_FRAG_FST, ntohs(fh->frag_off) & ~0x7, !((fraginfo->flags & IP6T_FRAG_FST) && (ntohs(fh->frag_off) & ~0x7))); pr_debug("mf %02X %02X %02X ", fraginfo->flags & IP6T_FRAG_MF, ntohs(fh->frag_off) & IP6_MF, !((fraginfo->flags & IP6T_FRAG_MF) && !((ntohs(fh->frag_off) & IP6_MF)))); pr_debug("last %02X %02X %02X\n", fraginfo->flags & IP6T_FRAG_NMF, ntohs(fh->frag_off) & IP6_MF, !((fraginfo->flags & IP6T_FRAG_NMF) && (ntohs(fh->frag_off) & IP6_MF))); return id_match(fraginfo->ids[0], fraginfo->ids[1], ntohl(fh->identification), !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) && !((fraginfo->flags & IP6T_FRAG_RES) && (fh->reserved || (ntohs(fh->frag_off) & 0x6))) && !((fraginfo->flags & IP6T_FRAG_FST) && (ntohs(fh->frag_off) & ~0x7)) && !((fraginfo->flags & IP6T_FRAG_MF) && !(ntohs(fh->frag_off) & IP6_MF)) && !((fraginfo->flags & IP6T_FRAG_NMF) && (ntohs(fh->frag_off) & IP6_MF)); } static int frag_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_frag *fraginfo = par->matchinfo; if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { pr_debug("unknown flags %X\n", fraginfo->invflags); return -EINVAL; } return 0; } static struct xt_match frag_mt6_reg __read_mostly = { .name = "frag", .family = NFPROTO_IPV6, .match = frag_mt6, .matchsize = sizeof(struct ip6t_frag), .checkentry = frag_mt6_check, .me = THIS_MODULE, }; static int __init frag_mt6_init(void) { return xt_register_match(&frag_mt6_reg); } static void __exit frag_mt6_exit(void) { xt_unregister_match(&frag_mt6_reg); } module_init(frag_mt6_init); module_exit(frag_mt6_exit);
4 4 4 4 22 22 7 4 3 2 2 2 2 3 14 14 14 19 19 19 6 5 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "originator.h" #include "main.h" #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include "distributed-arp-table.h" #include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "multicast.h" #include "netlink.h" #include "routing.h" #include "translation-table.h" /* hash class keys */ static struct lock_class_key batadv_orig_hash_lock_class_key; /** * batadv_orig_hash_find() - Find and return originator from orig_hash * @bat_priv: the bat priv with all the mesh interface information * @data: mac address of the originator * * Return: orig_node (with increased refcnt), NULL on errors */ struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node, *orig_node_tmp = NULL; int index; if (!hash) return NULL; index = batadv_choose_orig(data, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (!batadv_compare_eth(orig_node, data)) continue; if (!kref_get_unless_zero(&orig_node->refcount)) continue; orig_node_tmp = orig_node; break; } rcu_read_unlock(); return orig_node_tmp; } static void batadv_purge_orig(struct work_struct *work); /** * batadv_compare_orig() - comparing function used in the originator hash table * @node: node in the local table * @data2: second object to compare the node to * * Return: true if they are the same originator */ bool batadv_compare_orig(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_orig_node, hash_entry); return batadv_compare_eth(data1, data2); } /** * batadv_orig_node_vlan_get() - get an orig_node_vlan object * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * * Return: the vlan object identified by vid and belonging to orig_node or NULL * if it does not exist. */ struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_orig_node_vlan *vlan = NULL, *tmp; rcu_read_lock(); hlist_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) { if (tmp->vid != vid) continue; if (!kref_get_unless_zero(&tmp->refcount)) continue; vlan = tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_vlan_id_valid() - check if vlan id is in valid batman-adv encoding * @vid: the VLAN identifier * * Return: true when either no vlan is set or if VLAN is in correct range, * false otherwise */ static bool batadv_vlan_id_valid(unsigned short vid) { unsigned short non_vlan = vid & ~(BATADV_VLAN_HAS_TAG | VLAN_VID_MASK); if (vid == 0) return true; if (!(vid & BATADV_VLAN_HAS_TAG)) return false; if (non_vlan) return false; return true; } /** * batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan * object * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * * Return: NULL in case of failure or the vlan object identified by vid and * belonging to orig_node otherwise. The object is created and added to the list * if it does not exist. * * The object is returned with refcounter increased by 1. */ struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_orig_node_vlan *vlan; if (!batadv_vlan_id_valid(vid)) return NULL; spin_lock_bh(&orig_node->vlan_list_lock); /* first look if an object for this vid already exists */ vlan = batadv_orig_node_vlan_get(orig_node, vid); if (vlan) goto out; vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) goto out; kref_init(&vlan->refcount); vlan->vid = vid; kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); out: spin_unlock_bh(&orig_node->vlan_list_lock); return vlan; } /** * batadv_orig_node_vlan_release() - release originator-vlan object from lists * and queue for free after rcu grace period * @ref: kref pointer of the originator-vlan object */ void batadv_orig_node_vlan_release(struct kref *ref) { struct batadv_orig_node_vlan *orig_vlan; orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount); kfree_rcu(orig_vlan, rcu); } /** * batadv_originator_init() - Initialize all originator structures * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ int batadv_originator_init(struct batadv_priv *bat_priv) { if (bat_priv->orig_hash) return 0; bat_priv->orig_hash = batadv_hash_new(1024); if (!bat_priv->orig_hash) goto err; batadv_hash_set_lock_class(bat_priv->orig_hash, &batadv_orig_hash_lock_class_key); INIT_DELAYED_WORK(&bat_priv->orig_work, batadv_purge_orig); queue_delayed_work(batadv_event_workqueue, &bat_priv->orig_work, msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); return 0; err: return -ENOMEM; } /** * batadv_neigh_ifinfo_release() - release neigh_ifinfo from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_ifinfo */ void batadv_neigh_ifinfo_release(struct kref *ref) { struct batadv_neigh_ifinfo *neigh_ifinfo; neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount); if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) batadv_hardif_put(neigh_ifinfo->if_outgoing); kfree_rcu(neigh_ifinfo, rcu); } /** * batadv_hardif_neigh_release() - release hardif neigh node from lists and * queue for free after rcu grace period * @ref: kref pointer of the neigh_node */ void batadv_hardif_neigh_release(struct kref *ref) { struct batadv_hardif_neigh_node *hardif_neigh; hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node, refcount); spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); hlist_del_init_rcu(&hardif_neigh->list); spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); batadv_hardif_put(hardif_neigh->if_incoming); kfree_rcu(hardif_neigh, rcu); } /** * batadv_neigh_node_release() - release neigh_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_node */ void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; neigh_node = container_of(ref, struct batadv_neigh_node, refcount); hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { batadv_neigh_ifinfo_put(neigh_ifinfo); } batadv_hardif_neigh_put(neigh_node->hardif_neigh); batadv_hardif_put(neigh_node->if_incoming); kfree_rcu(neigh_node, rcu); } /** * batadv_orig_router_get() - router to the originator depending on iface * @orig_node: the orig node for the router * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * * Return: the neighbor which should be the router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) { if (orig_ifinfo->if_outgoing != if_outgoing) continue; router = rcu_dereference(orig_ifinfo->router); break; } if (router && !kref_get_unless_zero(&router->refcount)) router = NULL; rcu_read_unlock(); return router; } /** * batadv_orig_to_router() - get next hop neighbor to an orig address * @bat_priv: the bat priv with all the mesh interface information * @orig_addr: the originator MAC address to search the best next hop router for * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * * Return: A neighbor node which is the best router towards the given originator * address. */ struct batadv_neigh_node * batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; orig_node = batadv_orig_hash_find(bat_priv, orig_addr); if (!orig_node) return NULL; neigh_node = batadv_find_router(bat_priv, orig_node, if_outgoing); batadv_orig_node_put(orig_node); return neigh_node; } /** * batadv_orig_ifinfo_get() - find the ifinfo from an orig_node * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * Return: the requested orig_ifinfo or NULL if not found. * * The object is returned with refcounter increased by 1. */ struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *tmp, *orig_ifinfo = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp, &orig_node->ifinfo_list, list) { if (tmp->if_outgoing != if_outgoing) continue; if (!kref_get_unless_zero(&tmp->refcount)) continue; orig_ifinfo = tmp; break; } rcu_read_unlock(); return orig_ifinfo; } /** * batadv_orig_ifinfo_new() - search and possibly create an orig_ifinfo object * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing * interface otherwise. The object is created and added to the list * if it does not exist. * * The object is returned with refcounter increased by 1. */ struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; unsigned long reset_time; spin_lock_bh(&orig_node->neigh_list_lock); orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); if (orig_ifinfo) goto out; orig_ifinfo = kzalloc(sizeof(*orig_ifinfo), GFP_ATOMIC); if (!orig_ifinfo) goto out; if (if_outgoing != BATADV_IF_DEFAULT) kref_get(&if_outgoing->refcount); reset_time = jiffies - 1; reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); orig_ifinfo->batman_seqno_reset = reset_time; orig_ifinfo->if_outgoing = if_outgoing; INIT_HLIST_NODE(&orig_ifinfo->list); kref_init(&orig_ifinfo->refcount); kref_get(&orig_ifinfo->refcount); hlist_add_head_rcu(&orig_ifinfo->list, &orig_node->ifinfo_list); out: spin_unlock_bh(&orig_node->neigh_list_lock); return orig_ifinfo; } /** * batadv_neigh_ifinfo_get() - find the ifinfo from an neigh_node * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * The object is returned with refcounter increased by 1. * * Return: the requested neigh_ifinfo or NULL if not found */ struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL, *tmp_neigh_ifinfo; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_ifinfo, &neigh->ifinfo_list, list) { if (tmp_neigh_ifinfo->if_outgoing != if_outgoing) continue; if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount)) continue; neigh_ifinfo = tmp_neigh_ifinfo; break; } rcu_read_unlock(); return neigh_ifinfo; } /** * batadv_neigh_ifinfo_new() - search and possibly create an neigh_ifinfo object * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * Return: NULL in case of failure or the neigh_ifinfo object for the * if_outgoing interface otherwise. The object is created and added to the list * if it does not exist. * * The object is returned with refcounter increased by 1. */ struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo; spin_lock_bh(&neigh->ifinfo_lock); neigh_ifinfo = batadv_neigh_ifinfo_get(neigh, if_outgoing); if (neigh_ifinfo) goto out; neigh_ifinfo = kzalloc(sizeof(*neigh_ifinfo), GFP_ATOMIC); if (!neigh_ifinfo) goto out; if (if_outgoing) kref_get(&if_outgoing->refcount); INIT_HLIST_NODE(&neigh_ifinfo->list); kref_init(&neigh_ifinfo->refcount); neigh_ifinfo->if_outgoing = if_outgoing; kref_get(&neigh_ifinfo->refcount); hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); out: spin_unlock_bh(&neigh->ifinfo_lock); return neigh_ifinfo; } /** * batadv_neigh_node_get() - retrieve a neighbour from the list * @orig_node: originator which the neighbour belongs to * @hard_iface: the interface where this neighbour is connected to * @addr: the address of the neighbour * * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. * * Return: neighbor when found. Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, const struct batadv_hard_iface *hard_iface, const u8 *addr) { struct batadv_neigh_node *tmp_neigh_node, *res = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { if (!batadv_compare_eth(tmp_neigh_node->addr, addr)) continue; if (tmp_neigh_node->if_incoming != hard_iface) continue; if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; res = tmp_neigh_node; break; } rcu_read_unlock(); return res; } /** * batadv_hardif_neigh_create() - create a hardif neighbour node * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * @orig_node: originator object representing the neighbour * * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hardif_neigh_node *hardif_neigh; spin_lock_bh(&hard_iface->neigh_list_lock); /* check if neighbor hasn't been added in the meantime */ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr); if (hardif_neigh) goto out; hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); if (!hardif_neigh) goto out; kref_get(&hard_iface->refcount); INIT_HLIST_NODE(&hardif_neigh->list); ether_addr_copy(hardif_neigh->addr, neigh_addr); ether_addr_copy(hardif_neigh->orig, orig_node->orig); hardif_neigh->if_incoming = hard_iface; hardif_neigh->last_seen = jiffies; kref_init(&hardif_neigh->refcount); if (bat_priv->algo_ops->neigh.hardif_init) bat_priv->algo_ops->neigh.hardif_init(hardif_neigh); hlist_add_head_rcu(&hardif_neigh->list, &hard_iface->neigh_list); out: spin_unlock_bh(&hard_iface->neigh_list_lock); return hardif_neigh; } /** * batadv_hardif_neigh_get_or_create() - retrieve or create a hardif neighbour * node * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * @orig_node: originator object representing the neighbour * * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node) { struct batadv_hardif_neigh_node *hardif_neigh; /* first check without locking to avoid the overhead */ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr); if (hardif_neigh) return hardif_neigh; return batadv_hardif_neigh_create(hard_iface, neigh_addr, orig_node); } /** * batadv_hardif_neigh_get() - retrieve a hardif neighbour from the list * @hard_iface: the interface where this neighbour is connected to * @neigh_addr: the address of the neighbour * * Looks for and possibly returns a neighbour belonging to this hard interface. * * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr) { struct batadv_hardif_neigh_node *tmp_hardif_neigh, *hardif_neigh = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_hardif_neigh, &hard_iface->neigh_list, list) { if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr)) continue; if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount)) continue; hardif_neigh = tmp_hardif_neigh; break; } rcu_read_unlock(); return hardif_neigh; } /** * batadv_neigh_node_create() - create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface * * Allocates a new neigh_node object and initialises all the generic fields. * * Return: the neighbour node if found or created or NULL otherwise. */ static struct batadv_neigh_node * batadv_neigh_node_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; struct batadv_hardif_neigh_node *hardif_neigh = NULL; spin_lock_bh(&orig_node->neigh_list_lock); neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); if (neigh_node) goto out; hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface, neigh_addr, orig_node); if (!hardif_neigh) goto out; neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); if (!neigh_node) goto out; INIT_HLIST_NODE(&neigh_node->list); INIT_HLIST_HEAD(&neigh_node->ifinfo_list); spin_lock_init(&neigh_node->ifinfo_lock); kref_get(&hard_iface->refcount); ether_addr_copy(neigh_node->addr, neigh_addr); neigh_node->if_incoming = hard_iface; neigh_node->orig_node = orig_node; neigh_node->last_seen = jiffies; /* increment unique neighbor refcount */ kref_get(&hardif_neigh->refcount); neigh_node->hardif_neigh = hardif_neigh; /* extra reference for return */ kref_init(&neigh_node->refcount); kref_get(&neigh_node->refcount); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); out: spin_unlock_bh(&orig_node->neigh_list_lock); batadv_hardif_neigh_put(hardif_neigh); return neigh_node; } /** * batadv_neigh_node_get_or_create() - retrieve or create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface * * Return: the neighbour node if found or created or NULL otherwise. */ struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; /* first check without locking to avoid the overhead */ neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); if (neigh_node) return neigh_node; return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); } /** * batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a * specific outgoing interface * @msg: message to dump into * @cb: parameters for the dump * * Return: 0 or error value */ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if, *hard_iface; struct net_device *mesh_iface; struct batadv_priv *bat_priv; int ret; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out_put_mesh_iface; } hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); goto out_put_primary_if; } else if (IS_ERR(hard_iface)) { /* => PTR_ERR(hard_iface) == -ENONET * => no hard-iface given, ok */ hard_iface = BATADV_IF_DEFAULT; } if (!bat_priv->algo_ops->neigh.dump) { ret = -EOPNOTSUPP; goto out_put_hard_iface; } bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hard_iface); ret = msg->len; out_put_hard_iface: batadv_hardif_put(hard_iface); out_put_primary_if: batadv_hardif_put(primary_if); out_put_mesh_iface: dev_put(mesh_iface); return ret; } /** * batadv_orig_ifinfo_release() - release orig_ifinfo from lists and queue for * free after rcu grace period * @ref: kref pointer of the orig_ifinfo */ void batadv_orig_ifinfo_release(struct kref *ref) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount); if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) batadv_hardif_put(orig_ifinfo->if_outgoing); /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); batadv_neigh_node_put(router); kfree_rcu(orig_ifinfo, rcu); } /** * batadv_orig_node_free_rcu() - free the orig_node * @rcu: rcu pointer of the orig_node */ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { struct batadv_orig_node *orig_node; orig_node = container_of(rcu, struct batadv_orig_node, rcu); batadv_mcast_purge_orig(orig_node); batadv_frag_purge_orig(orig_node, NULL); kfree(orig_node->tt_buff); kfree(orig_node); } /** * batadv_orig_node_release() - release orig_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the orig_node */ void batadv_orig_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_orig_node_vlan *vlan; struct batadv_orig_ifinfo *last_candidate; orig_node = container_of(ref, struct batadv_orig_node, refcount); spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); batadv_neigh_node_put(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_put(orig_ifinfo); } last_candidate = orig_node->last_bonding_candidate; orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); batadv_orig_ifinfo_put(last_candidate); spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { hlist_del_rcu(&vlan->list); batadv_orig_node_vlan_put(vlan); } spin_unlock_bh(&orig_node->vlan_list_lock); call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } /** * batadv_originator_free() - Free all originator structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_originator_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; u32 i; if (!hash) return; cancel_delayed_work_sync(&bat_priv->orig_work); bat_priv->orig_hash = NULL; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { hlist_del_rcu(&orig_node->hash_entry); batadv_orig_node_put(orig_node); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); } /** * batadv_orig_node_new() - creates a new orig_node * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the originator * * Creates a new originator object and initialises all the generic fields. * The new object is not added to the originator list. * * Return: the newly created object or NULL on failure. */ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; struct batadv_orig_node_vlan *vlan; unsigned long reset_time; int i; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Creating new originator: %pM\n", addr); orig_node = kzalloc(sizeof(*orig_node), GFP_ATOMIC); if (!orig_node) return NULL; INIT_HLIST_HEAD(&orig_node->neigh_list); INIT_HLIST_HEAD(&orig_node->vlan_list); INIT_HLIST_HEAD(&orig_node->ifinfo_list); spin_lock_init(&orig_node->bcast_seqno_lock); spin_lock_init(&orig_node->neigh_list_lock); spin_lock_init(&orig_node->tt_buff_lock); spin_lock_init(&orig_node->tt_lock); spin_lock_init(&orig_node->vlan_list_lock); /* extra reference for return */ kref_init(&orig_node->refcount); orig_node->bat_priv = bat_priv; ether_addr_copy(orig_node->orig, addr); batadv_dat_init_orig_node_addr(orig_node); atomic_set(&orig_node->last_ttvn, 0); orig_node->tt_buff = NULL; orig_node->tt_buff_len = 0; orig_node->last_seen = jiffies; reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); orig_node->bcast_seqno_reset = reset_time; #ifdef CONFIG_BATMAN_ADV_MCAST orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4; orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; orig_node->mcast_flags |= BATADV_MCAST_HAVE_MC_PTYPE_CAPA; INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); spin_lock_init(&orig_node->mcast_handler_lock); #endif /* create a vlan object for the "untagged" LAN */ vlan = batadv_orig_node_vlan_new(orig_node, BATADV_NO_FLAGS); if (!vlan) goto free_orig_node; /* batadv_orig_node_vlan_new() increases the refcounter. * Immediately release vlan since it is not needed anymore in this * context */ batadv_orig_node_vlan_put(vlan); for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { INIT_HLIST_HEAD(&orig_node->fragments[i].fragment_list); spin_lock_init(&orig_node->fragments[i].lock); orig_node->fragments[i].size = 0; } return orig_node; free_orig_node: kfree(orig_node); return NULL; } /** * batadv_purge_neigh_ifinfo() - purge obsolete ifinfo entries from neighbor * @bat_priv: the bat priv with all the mesh interface information * @neigh: orig node which is to be checked */ static void batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh) { struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_hard_iface *if_outgoing; struct hlist_node *node_tmp; spin_lock_bh(&neigh->ifinfo_lock); /* for all ifinfo objects for this neighinator */ hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh->ifinfo_list, list) { if_outgoing = neigh_ifinfo->if_outgoing; /* always keep the default interface */ if (if_outgoing == BATADV_IF_DEFAULT) continue; /* don't purge if the interface is not (going) down */ if (if_outgoing->if_status != BATADV_IF_INACTIVE && if_outgoing->if_status != BATADV_IF_NOT_IN_USE && if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor/ifinfo purge: neighbor %pM, iface: %s\n", neigh->addr, if_outgoing->net_dev->name); hlist_del_rcu(&neigh_ifinfo->list); batadv_neigh_ifinfo_put(neigh_ifinfo); } spin_unlock_bh(&neigh->ifinfo_lock); } /** * batadv_purge_orig_ifinfo() - purge obsolete ifinfo entries from originator * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * Return: true if any ifinfo entry was purged, false otherwise. */ static bool batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_hard_iface *if_outgoing; struct hlist_node *node_tmp; bool ifinfo_purged = false; spin_lock_bh(&orig_node->neigh_list_lock); /* for all ifinfo objects for this originator */ hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { if_outgoing = orig_ifinfo->if_outgoing; /* always keep the default interface */ if (if_outgoing == BATADV_IF_DEFAULT) continue; /* don't purge if the interface is not (going) down */ if (if_outgoing->if_status != BATADV_IF_INACTIVE && if_outgoing->if_status != BATADV_IF_NOT_IN_USE && if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "router/ifinfo purge: originator %pM, iface: %s\n", orig_node->orig, if_outgoing->net_dev->name); ifinfo_purged = true; hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_put(orig_ifinfo); if (orig_node->last_bonding_candidate == orig_ifinfo) { orig_node->last_bonding_candidate = NULL; batadv_orig_ifinfo_put(orig_ifinfo); } } spin_unlock_bh(&orig_node->neigh_list_lock); return ifinfo_purged; } /** * batadv_purge_orig_neighbors() - purges neighbors from originator * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * Return: true if any neighbor was purged, false otherwise */ static bool batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; bool neigh_purged = false; unsigned long last_seen; struct batadv_hard_iface *if_incoming; spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { last_seen = neigh_node->last_seen; if_incoming = neigh_node->if_incoming; if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) || if_incoming->if_status == BATADV_IF_INACTIVE || if_incoming->if_status == BATADV_IF_NOT_IN_USE || if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) { if (if_incoming->if_status == BATADV_IF_INACTIVE || if_incoming->if_status == BATADV_IF_NOT_IN_USE || if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n", orig_node->orig, neigh_node->addr, if_incoming->net_dev->name); else batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor timeout: originator %pM, neighbor: %pM, last_seen: %u\n", orig_node->orig, neigh_node->addr, jiffies_to_msecs(last_seen)); neigh_purged = true; hlist_del_rcu(&neigh_node->list); batadv_neigh_node_put(neigh_node); } else { /* only necessary if not the whole neighbor is to be * deleted, but some interface has been removed. */ batadv_purge_neigh_ifinfo(bat_priv, neigh_node); } } spin_unlock_bh(&orig_node->neigh_list_lock); return neigh_purged; } /** * batadv_find_best_neighbor() - finds the best neighbor after purging * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * @if_outgoing: the interface for which the metric should be compared * * Return: the current best neighbor, with refcount increased. */ static struct batadv_neigh_node * batadv_find_best_neighbor(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *best = NULL, *neigh; struct batadv_algo_ops *bao = bat_priv->algo_ops; rcu_read_lock(); hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) { if (best && (bao->neigh.cmp(neigh, if_outgoing, best, if_outgoing) <= 0)) continue; if (!kref_get_unless_zero(&neigh->refcount)) continue; batadv_neigh_node_put(best); best = neigh; } rcu_read_unlock(); return best; } /** * batadv_purge_orig_node() - purges obsolete information from an orig_node * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * This function checks if the orig_node or substructures of it have become * obsolete, and purges this information if that's the case. * * Return: true if the orig_node is to be removed, false otherwise. */ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_neigh_node *best_neigh_node; struct batadv_hard_iface *hard_iface; bool changed_ifinfo, changed_neigh; struct list_head *iter; if (batadv_has_timed_out(orig_node->last_seen, 2 * BATADV_PURGE_TIMEOUT)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Originator timeout: originator %pM, last_seen %u\n", orig_node->orig, jiffies_to_msecs(orig_node->last_seen)); return true; } changed_ifinfo = batadv_purge_orig_ifinfo(bat_priv, orig_node); changed_neigh = batadv_purge_orig_neighbors(bat_priv, orig_node); if (!changed_ifinfo && !changed_neigh) return false; /* first for NULL ... */ best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node, BATADV_IF_DEFAULT); batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best_neigh_node); batadv_neigh_node_put(best_neigh_node); /* ... then for all other interfaces. */ rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node, hard_iface); batadv_update_route(bat_priv, orig_node, hard_iface, best_neigh_node); batadv_neigh_node_put(best_neigh_node); batadv_hardif_put(hard_iface); } rcu_read_unlock(); return false; } /** * batadv_purge_orig_ref() - Purge all outdated originators * @bat_priv: the bat priv with all the mesh interface information */ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; u32 i; if (!hash) return; /* for all origins... */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; if (hlist_empty(head)) continue; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { if (batadv_purge_orig_node(bat_priv, orig_node)) { batadv_gw_node_delete(bat_priv, orig_node); hlist_del_rcu(&orig_node->hash_entry); batadv_tt_global_del_orig(orig_node->bat_priv, orig_node, -1, "originator timed out"); batadv_orig_node_put(orig_node); continue; } batadv_frag_purge_orig(orig_node, batadv_frag_check_entry); } spin_unlock_bh(list_lock); } batadv_gw_election(bat_priv); } static void batadv_purge_orig(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); batadv_purge_orig_ref(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->orig_work, msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } /** * batadv_orig_dump() - Dump to netlink the originator infos for a specific * outgoing interface * @msg: message to dump into * @cb: parameters for the dump * * Return: 0 or error value */ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if, *hard_iface; struct net_device *mesh_iface; struct batadv_priv *bat_priv; int ret; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out_put_mesh_iface; } hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); goto out_put_primary_if; } else if (IS_ERR(hard_iface)) { /* => PTR_ERR(hard_iface) == -ENONET * => no hard-iface given, ok */ hard_iface = BATADV_IF_DEFAULT; } if (!bat_priv->algo_ops->orig.dump) { ret = -EOPNOTSUPP; goto out_put_hard_iface; } bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hard_iface); ret = msg->len; out_put_hard_iface: batadv_hardif_put(hard_iface); out_put_primary_if: batadv_hardif_put(primary_if); out_put_mesh_iface: dev_put(mesh_iface); return ret; }
5927 1 5117 2209 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PERCPU_COUNTER_H #define _LINUX_PERCPU_COUNTER_H /* * A simple "approximate counter" for use in ext2 and ext3 superblocks. * * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4. */ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/list.h> #include <linux/threads.h> #include <linux/percpu.h> #include <linux/types.h> /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX #ifdef CONFIG_SMP struct percpu_counter { raw_spinlock_t lock; s64 count; #ifdef CONFIG_HOTPLUG_CPU struct list_head list; /* All percpu_counters are on a list */ #endif s32 __percpu *counters; }; extern int percpu_counter_batch; int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key); #define percpu_counter_init_many(fbc, value, gfp, nr_counters) \ ({ \ static struct lock_class_key __key; \ \ __percpu_counter_init_many(fbc, value, gfp, nr_counters,\ &__key); \ }) #define percpu_counter_init(fbc, value, gfp) \ percpu_counter_init_many(fbc, value, gfp, 1) void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters); static inline void percpu_counter_destroy(struct percpu_counter *fbc) { percpu_counter_destroy_many(fbc, 1); } void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) { return __percpu_counter_compare(fbc, rhs, percpu_counter_batch); } static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { return __percpu_counter_limited_add(fbc, limit, amount, percpu_counter_batch); } /* * With percpu_counter_add_local() and percpu_counter_sub_local(), counts * are accumulated in local per cpu counter and not in fbc->count until * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter * write efficient. * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be * used to add up the counts from each CPU to account for all the local * counts. So percpu_counter_add_local() and percpu_counter_sub_local() * should be used when a counter is updated frequently and read rarely. */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); } static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); return ret < 0 ? 0 : ret; } static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { return __percpu_counter_sum(fbc); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } /* * It is possible for the percpu_counter_read() to return a small negative * number for some counter which should never be negative. * */ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { /* Prevent reloads of fbc->count */ s64 ret = READ_ONCE(fbc->count); if (ret >= 0) return ret; return 0; } static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return (fbc->counters != NULL); } #else /* !CONFIG_SMP */ struct percpu_counter { s64 count; }; static inline int percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters) { u32 i; for (i = 0; i < nr_counters; i++) fbc[i].count = amount; return 0; } static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp) { return percpu_counter_init_many(fbc, amount, gfp, 1); } static inline void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { } static inline void percpu_counter_destroy(struct percpu_counter *fbc) { } static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { fbc->count = amount; } static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) { if (fbc->count > rhs) return 1; else if (fbc->count < rhs) return -1; else return 0; } static inline int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { return percpu_counter_compare(fbc, rhs); } static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { unsigned long flags; local_irq_save(flags); fbc->count += amount; local_irq_restore(flags); } static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { unsigned long flags; bool good = false; s64 count; if (amount == 0) return true; local_irq_save(flags); count = fbc->count + amount; if ((amount > 0 && count <= limit) || (amount < 0 && count >= limit)) { fbc->count = count; good = true; } local_irq_restore(flags); return good; } /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add(fbc, amount); } static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { percpu_counter_add(fbc, amount); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } /* * percpu_counter is intended to track positive numbers. In the UP case the * number should never be negative. */ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { return fbc->count; } static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { return percpu_counter_read_positive(fbc); } static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { return percpu_counter_read(fbc); } static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; } static inline void percpu_counter_sync(struct percpu_counter *fbc) { } #endif /* CONFIG_SMP */ static inline void percpu_counter_inc(struct percpu_counter *fbc) { percpu_counter_add(fbc, 1); } static inline void percpu_counter_dec(struct percpu_counter *fbc) { percpu_counter_add(fbc, -1); } static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) { percpu_counter_add(fbc, -amount); } static inline void percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_local(fbc, -amount); } #endif /* _LINUX_PERCPU_COUNTER_H */
4 817 817 671 442 444 30 817 785 18 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() * * start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_free_vmas() * * tlb_free_vmas() marks the start of unlinking of one or more vmas * and freeing page-tables. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / tlb_remove_page_size() * - __tlb_remove_folio_pages() / __tlb_remove_page_size() * - __tlb_remove_folio_pages_size() * * __tlb_remove_folio_pages_size() is the basic primitive that queues pages * for freeing. It will return a boolean indicating if the queue is (now) * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), * however, instead of removing a single page, assume PAGE_SIZE and remove * the given number of consecutive pages that are all part of the * same (large) folio. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() or * use the generic __tlb_remove_table(), which does the actual freeing of these * pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_FLUSH_CACHE * * Indicates the architecture has flush_cache_range() but it needs *NOT* be called * before unmapping a VMA. * * NOTE: strictly speaking we shouldn't have this knob and instead rely on * flush_cache_range() being a NOP, except Sparc64 seems to be * different here. * * MMU_GATHER_MERGE_VMAS * * Indicates the architecture wants to merge ranges over VMAs; typical when * multiple range invalidates are more expensive than a full invalidate. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). This * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) #ifndef __HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor_free(ptdesc); } #endif extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor(ptdesc); tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif void tlb_remove_table_sync_one(void); #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif static inline void tlb_remove_table_sync_one(void) { } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* * This both sets 'delayed_rmap', and returns true. It would be an inline * function, except we define it before the 'struct mmu_gather'. */ #define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); #endif #endif /* * We have a no-op version of the rmap removal that doesn't * delay anything. That is used on S390, which flushes remote * TLBs synchronously, and on UP, which doesn't have any * remote TLBs to flush and is not preemptible due to this * all happening under the page table lock. */ #ifndef tlb_delay_rmap #define tlb_delay_rmap(tlb) (false) static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * Do we have pending delayed rmap removals? */ unsigned int delayed_rmap : 1; /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int vma_pfn : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) #error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); /* * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma * in the tracked range, see tlb_free_vmas(). */ tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds)) return; tlb_flush(tlb); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, false, page_size)) tlb_flush_mmu(tlb); } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); #ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); #endif } static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } static inline void tlb_free_vmas(struct mmu_gather *tlb) { if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the * page mapcount -- there might not be page-frames for these PFNs * after all. * * Specifically() there is a race between munmap() and * unmap_mapping_range(), where munmap() will unlink the VMA, such * that unmap_mapping_range() will no longer observe the VMA and * no-op, without observing the TLBI, returning prematurely. * * So if we're about to unlink such a VMA, and we have pending * TLBI for such a vma, flush things now. */ if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); } /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. */ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) { } #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for * later tlb invalidation. * * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple * consecutive ptes instead of only a single one. */ static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, pte_t *ptep, unsigned int nr, unsigned long address) { tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); for (;;) { __tlb_remove_tlb_entry(tlb, ptep, address); if (--nr == 0) break; ptep++; address += PAGE_SIZE; } } #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz >= P4D_SIZE) \ tlb_flush_p4d_range(tlb, address, _sz); \ else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ else if (_sz >= PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else \ tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. * * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef pte_needs_flush static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { return true; } #endif #ifndef huge_pmd_needs_flush static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { return true; } #endif #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */
2 2 2 2 1 1 11 1 1 1 1 2 2 1 2 3 3 3 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_simple.c Simple example of an action * * Authors: Jamal Hadi Salim (2005-8) */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_defact *d = to_defact(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); /* print policy string followed by _ then packet count * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } static void tcf_simp_release(struct tc_action *a) { struct tcf_defact *d = to_defact(a); kfree(d->tcfd_defdata); } static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) { d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } static int reset_policy(struct tc_action *a, const struct nlattr *defdata, struct tc_defact *p, struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tcf_chain *goto_ch = NULL; struct tcf_defact *d; int err; err = tcf_action_check_ctrlact(p->action, tp, &goto_ch, extack); if (err < 0) return err; d = to_defact(a); spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return 0; } static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_defact *parm; struct tcf_defact *d; bool exists = false; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_DEF_MAX, nla, simple_policy, NULL); if (err < 0) return err; if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_simp_ops, bind, false, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } d = to_defact(*a); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; err = alloc_defdata(d, tb[TCA_DEF_DATA]); if (err < 0) goto put_chain; tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } err = reset_policy(*a, tb[TCA_DEF_DATA], parm, tp, extack); if (err) goto release_idr; } return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } static struct tc_action_ops act_simp_ops = { .kind = "simple", .id = TCA_ID_SIMP, .owner = THIS_MODULE, .act = tcf_simp_act, .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, .size = sizeof(struct tcf_defact), }; MODULE_ALIAS_NET_ACT("simple"); static __net_init int simp_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); return tc_action_net_init(net, tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_simp_ops.net_id); } static struct pernet_operations simp_net_ops = { .init = simp_init_net, .exit_batch = simp_exit_net, .id = &act_simp_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); MODULE_DESCRIPTION("Simple example action"); MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { int ret = tcf_register_action(&act_simp_ops, &simp_net_ops); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; } static void __exit simp_cleanup_module(void) { tcf_unregister_action(&act_simp_ops, &simp_net_ops); } module_init(simp_init_module); module_exit(simp_cleanup_module);
3 30 15 30 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor policy definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #ifndef __AA_NAMESPACE_H #define __AA_NAMESPACE_H #include <linux/kref.h> #include "apparmor.h" #include "apparmorfs.h" #include "label.h" #include "policy.h" /* struct aa_ns_acct - accounting of profiles in namespace * @max_size: maximum space allowed for all profiles in namespace * @max_count: maximum number of profiles that can be in this namespace * @size: current size of profiles * @count: current count of profiles (includes null profiles) */ struct aa_ns_acct { int max_size; int max_count; int size; int count; }; /* struct aa_ns - namespace for a set of profiles * @base: common policy * @parent: parent of namespace * @lock: lock for modifying the object * @acct: accounting for the namespace * @unconfined: special unconfined profile for the namespace * @sub_ns: list of namespaces under the current namespace. * @uniq_null: uniq value used for null learning profiles * @uniq_id: a unique id count for the profiles in the namespace * @level: level of ns within the tree hierarchy * @dents: dentries for the namespaces file entries in apparmorfs * * An aa_ns defines the set profiles that are searched to determine which * profile to attach to a task. Profiles can not be shared between aa_ns * and profile names within a namespace are guaranteed to be unique. When * profiles in separate namespaces have the same name they are NOT considered * to be equivalent. * * Namespaces are hierarchical and only namespaces and profiles below the * current namespace are visible. * * Namespace names must be unique and can not contain the characters :/\0 */ struct aa_ns { struct aa_policy base; struct aa_ns *parent; struct mutex lock; struct aa_ns_acct acct; struct aa_profile *unconfined; struct list_head sub_ns; atomic_t uniq_null; long uniq_id; int level; long revision; wait_queue_head_t wait; struct aa_labelset labels; struct list_head rawdata_list; struct dentry *dents[AAFS_NS_SIZEOF]; }; extern struct aa_label *kernel_t; extern struct aa_ns *root_ns; extern const char *aa_hidden_ns_name; #define ns_unconfined(NS) (&(NS)->unconfined->label) bool aa_ns_visible(struct aa_ns *curr, struct aa_ns *view, bool subns); const char *aa_ns_name(struct aa_ns *parent, struct aa_ns *child, bool subns); void aa_free_ns(struct aa_ns *ns); int aa_alloc_root_ns(void); void aa_free_root_ns(void); struct aa_ns *__aa_lookupn_ns(struct aa_ns *view, const char *hname, size_t n); struct aa_ns *aa_lookupn_ns(struct aa_ns *view, const char *name, size_t n); struct aa_ns *__aa_find_or_create_ns(struct aa_ns *parent, const char *name, struct dentry *dir); struct aa_ns *aa_prepare_ns(struct aa_ns *root, const char *name); void __aa_remove_ns(struct aa_ns *ns); static inline struct aa_profile *aa_deref_parent(struct aa_profile *p) { return rcu_dereference_protected(p->parent, mutex_is_locked(&p->ns->lock)); } /** * aa_get_ns - increment references count on @ns * @ns: namespace to increment reference count of (MAYBE NULL) * * Returns: pointer to @ns, if @ns is NULL returns NULL * Requires: @ns must be held with valid refcount when called */ static inline struct aa_ns *aa_get_ns(struct aa_ns *ns) { if (ns) aa_get_profile(ns->unconfined); return ns; } /** * aa_put_ns - decrement refcount on @ns * @ns: namespace to put reference of * * Decrement reference count of @ns and if no longer in use free it */ static inline void aa_put_ns(struct aa_ns *ns) { if (ns) aa_put_profile(ns->unconfined); } /** * __aa_findn_ns - find a namespace on a list by @name * @head: list to search for namespace on (NOT NULL) * @name: name of namespace to look for (NOT NULL) * @n: length of @name * Returns: unrefcounted namespace * * Requires: rcu_read_lock be held */ static inline struct aa_ns *__aa_findn_ns(struct list_head *head, const char *name, size_t n) { return (struct aa_ns *)__policy_strn_find(head, name, n); } static inline struct aa_ns *__aa_find_ns(struct list_head *head, const char *name) { return __aa_findn_ns(head, name, strlen(name)); } #endif /* AA_NAMESPACE_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MLD_H #define LINUX_MLD_H #include <linux/in6.h> #include <linux/icmpv6.h> /* MLDv1 Query/Report/Done */ struct mld_msg { struct icmp6hdr mld_hdr; struct in6_addr mld_mca; }; #define mld_type mld_hdr.icmp6_type #define mld_code mld_hdr.icmp6_code #define mld_cksum mld_hdr.icmp6_cksum #define mld_maxdelay mld_hdr.icmp6_maxdelay #define mld_reserved mld_hdr.icmp6_dataun.un_data16[1] /* Multicast Listener Discovery version 2 headers */ /* MLDv2 Report */ struct mld2_grec { __u8 grec_type; __u8 grec_auxwords; __be16 grec_nsrcs; struct in6_addr grec_mca; struct in6_addr grec_src[]; }; struct mld2_report { struct icmp6hdr mld2r_hdr; struct mld2_grec mld2r_grec[]; }; #define mld2r_type mld2r_hdr.icmp6_type #define mld2r_resv1 mld2r_hdr.icmp6_code #define mld2r_cksum mld2r_hdr.icmp6_cksum #define mld2r_resv2 mld2r_hdr.icmp6_dataun.un_data16[0] #define mld2r_ngrec mld2r_hdr.icmp6_dataun.un_data16[1] /* MLDv2 Query */ struct mld2_query { struct icmp6hdr mld2q_hdr; struct in6_addr mld2q_mca; #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 mld2q_qrv:3, mld2q_suppress:1, mld2q_resv2:4; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 mld2q_resv2:4, mld2q_suppress:1, mld2q_qrv:3; #else #error "Please fix <asm/byteorder.h>" #endif __u8 mld2q_qqic; __be16 mld2q_nsrcs; struct in6_addr mld2q_srcs[]; }; #define mld2q_type mld2q_hdr.icmp6_type #define mld2q_code mld2q_hdr.icmp6_code #define mld2q_cksum mld2q_hdr.icmp6_cksum #define mld2q_mrc mld2q_hdr.icmp6_maxdelay #define mld2q_resv1 mld2q_hdr.icmp6_dataun.un_data16[1] /* RFC3810, 5.1.3. Maximum Response Code: * * If Maximum Response Code >= 32768, Maximum Response Code represents a * floating-point value as follows: * * 0 1 2 3 4 5 6 7 8 9 A B C D E F * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define MLDV2_MRC_EXP(value) (((value) >> 12) & 0x0007) #define MLDV2_MRC_MAN(value) ((value) & 0x0fff) /* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code): * * If QQIC >= 128, QQIC represents a floating-point value as follows: * * 0 1 2 3 4 5 6 7 * +-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+ */ #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) #define MLD_EXP_MIN_LIMIT 32768UL #define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) #define MLD_MAX_QUEUE 8 #define MLD_MAX_SKBS 32 static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. Maximum Response Code */ unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); if (mc_mrc < MLD_EXP_MIN_LIMIT) { ret = mc_mrc; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_MRC_EXP(mc_mrc); mc_man = MLDV2_MRC_MAN(mc_mrc); ret = (mc_man | 0x1000) << (mc_exp + 3); } return ret; } #endif
20 20 20 20 20 21049 19630 20988 72 72 2071 2073 11992 11980 93 93 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * * Copyright (C) 1997 Andrew Main <zefram@fysh.org> * * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> int file_caps_enabled = 1; static int __init file_caps_disable(char *str) { file_caps_enabled = 0; return 1; } __setup("no_file_caps", file_caps_disable); #ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * * http://www.kernel.org/pub/linux/libs/security/linux-privs/ */ static void warn_legacy_capability_use(void) { pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", current->comm); } /* * Version 2 capabilities worked fine, but the linux/capability.h file * that accompanied their introduction encouraged their use without * the necessary user-space source code changes. As such, we have * created a version 3 with equivalent functionality to version 2, but * with a header change to protect legacy source code from using * version 2 when it wanted to use version 1. If your system has code * that trips the following warning, it is using version 2 specific * capabilities and may be doing so insecurely. * * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", current->comm); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { const struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (ret) return ret; /* * Annoying legacy format with 64-bit capabilities exposed * as two sets of 32-bit fields, so we need to split the * capability values up. */ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) return -EFAULT; return 0; } static kernel_cap_t mk_kernel_cap(u32 low, u32 high) { return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[2] = { { 0, }, }; unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode) { return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
12 3 12 10 7 6 5 3 5 3 16 16 16 1 20 20 14 13 14 12 20 20 16 1 25 24 3 21 26 1 1 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0 /* XDP sockets monitoring support * * Copyright(c) 2019 Intel Corporation. * * Author: Björn Töpel <bjorn.topel@intel.com> */ #include <linux/module.h> #include <net/xdp_sock.h> #include <linux/xdp_diag.h> #include <linux/sock_diag.h> #include "xsk_queue.h" #include "xsk.h" static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb) { struct xdp_diag_info di = {}; di.ifindex = xs->dev ? xs->dev->ifindex : 0; di.queue_id = xs->queue_id; return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di); } static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type, struct sk_buff *nlskb) { struct xdp_diag_ring dr = {}; dr.entries = queue->nentries; return nla_put(nlskb, nl_type, sizeof(dr), &dr); } static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, struct sk_buff *nlskb) { int err = 0; if (xs->rx) err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb); if (!err && xs->tx) err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb); return err; } static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) { struct xsk_buff_pool *pool = xs->pool; struct xdp_umem *umem = xs->umem; struct xdp_diag_umem du = {}; int err; if (!umem) return 0; du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = (pool && pool->netdev) ? pool->netdev->ifindex : 0; du.queue_id = pool ? pool->queue_id : 0; du.flags = 0; if (umem->zc) du.flags |= XDP_DU_F_ZEROCOPY; du.refs = refcount_read(&umem->users); err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); if (!err && pool && pool->fq) err = xsk_diag_put_ring(pool->fq, XDP_DIAG_UMEM_FILL_RING, nlskb); if (!err && pool && pool->cq) err = xsk_diag_put_ring(pool->cq, XDP_DIAG_UMEM_COMPLETION_RING, nlskb); return err; } static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) { struct xdp_diag_stats du = {}; du.n_rx_dropped = xs->rx_dropped; du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); du.n_rx_full = xs->rx_queue_full; du.n_fill_ring_empty = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); } static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, u32 portid, u32 seq, u32 flags, int sk_ino) { struct xdp_sock *xs = xdp_sk(sk); struct xdp_diag_msg *msg; struct nlmsghdr *nlh; nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg), flags); if (!nlh) return -EMSGSIZE; msg = nlmsg_data(nlh); memset(msg, 0, sizeof(*msg)); msg->xdiag_family = AF_XDP; msg->xdiag_type = sk->sk_type; msg->xdiag_ino = sk_ino; sock_diag_save_cookie(sk, msg->xdiag_cookie); mutex_lock(&xs->mutex); if (READ_ONCE(xs->state) == XSK_UNBOUND) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_INFO) && nla_put_u32(nlskb, XDP_DIAG_UID, from_kuid_munged(user_ns, sk_uid(sk)))) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_RING_CFG) && xsk_diag_put_rings_cfg(xs, nlskb)) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_UMEM) && xsk_diag_put_umem(xs, nlskb)) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_MEMINFO) && sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_STATS) && xsk_diag_put_stats(xs, nlskb)) goto out_nlmsg_trim; mutex_unlock(&xs->mutex); nlmsg_end(nlskb, nlh); return 0; out_nlmsg_trim: mutex_unlock(&xs->mutex); nlmsg_cancel(nlskb, nlh); return -EMSGSIZE; } static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb) { struct xdp_diag_req *req = nlmsg_data(cb->nlh); struct net *net = sock_net(nlskb->sk); int num = 0, s_num = cb->args[0]; struct sock *sk; mutex_lock(&net->xdp.lock); sk_for_each(sk, &net->xdp.list) { if (!net_eq(sock_net(sk), net)) continue; if (num++ < s_num) continue; if (xsk_diag_fill(sk, nlskb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, sock_i_ino(sk)) < 0) { num--; break; } } mutex_unlock(&net->xdp.lock); cb->args[0] = num; return nlskb->len; } static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) { struct netlink_dump_control c = { .dump = xsk_diag_dump }; int hdrlen = sizeof(struct xdp_diag_req); struct net *net = sock_net(nlskb->sk); if (nlmsg_len(hdr) < hdrlen) return -EINVAL; if (!(hdr->nlmsg_flags & NLM_F_DUMP)) return -EOPNOTSUPP; return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c); } static const struct sock_diag_handler xsk_diag_handler = { .owner = THIS_MODULE, .family = AF_XDP, .dump = xsk_diag_handler_dump, }; static int __init xsk_diag_init(void) { return sock_diag_register(&xsk_diag_handler); } static void __exit xsk_diag_exit(void) { sock_diag_unregister(&xsk_diag_handler); } module_init(xsk_diag_init); module_exit(xsk_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("XDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP);
191 191 191 191 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 // SPDX-License-Identifier: GPL-2.0 /* * Neil Brown <neilb@cse.unsw.edu.au> * J. Bruce Fields <bfields@umich.edu> * Andy Adamson <andros@umich.edu> * Dug Song <dugsong@monkey.org> * * RPCSEC_GSS server authentication. * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 * (gssapi) * * The RPCSEC_GSS involves three stages: * 1/ context creation * 2/ data exchange * 3/ context destruction * * Context creation is handled largely by upcalls to user-space. * In particular, GSS_Accept_sec_context is handled by an upcall * Data exchange is handled entirely within the kernel * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. * Context destruction is handled in-kernel * GSS_Delete_sec_context is in-kernel * * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. * The context handle and gss_token are used as a key into the rpcsec_init cache. * The content of this cache includes some of the outputs of GSS_Accept_sec_context, * being major_status, minor_status, context_handle, reply_token. * These are sent back to the client. * Sequence window management is handled by the kernel. The window size if currently * a compile time constant. * * When user-space is happy that a context is established, it places an entry * in the rpcsec_context cache. The key for this cache is the context_handle. * The content includes: * uid/gidlist - for determining access rights * mechanism type * mechanism specific information, such as a key * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/user_namespace.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/gss_krb5.h> #include <trace/events/rpcgss.h> #include "gss_rpc_upcall.h" /* * Unfortunately there isn't a maximum checksum size exported via the * GSS API. Manufacture one based on GSS mechanisms supported by this * implementation. */ #define GSS_MAX_CKSUMSIZE (GSS_KRB5_TOK_HDR_LEN + GSS_KRB5_MAX_CKSUM_LEN) /* * This value may be increased in the future to accommodate other * usage of the scratch buffer. */ #define GSS_SCRATCH_SIZE GSS_MAX_CKSUMSIZE struct gss_svc_data { /* decoded gss client cred: */ struct rpc_gss_wire_cred clcred; u32 gsd_databody_offset; struct rsc *rsci; /* for temporary results */ __be32 gsd_seq_num; u8 gsd_scratch[GSS_SCRATCH_SIZE]; }; /* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests * into replies. * * Key is context handle (\x if empty) and gss_token. * Content is major_status minor_status (integers) context_handle, reply_token. * */ static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b) { return a->len == b->len && 0 == memcmp(a->data, b->data, a->len); } #define RSI_HASHBITS 6 #define RSI_HASHMAX (1<<RSI_HASHBITS) struct rsi { struct cache_head h; struct xdr_netobj in_handle, in_token; struct xdr_netobj out_handle, out_token; int major_status, minor_status; struct rcu_head rcu_head; }; static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old); static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item); static void rsi_free(struct rsi *rsii) { kfree(rsii->in_handle.data); kfree(rsii->in_token.data); kfree(rsii->out_handle.data); kfree(rsii->out_token.data); } static void rsi_free_rcu(struct rcu_head *head) { struct rsi *rsii = container_of(head, struct rsi, rcu_head); rsi_free(rsii); kfree(rsii); } static void rsi_put(struct kref *ref) { struct rsi *rsii = container_of(ref, struct rsi, h.ref); call_rcu(&rsii->rcu_head, rsi_free_rcu); } static inline int rsi_hash(struct rsi *item) { return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); } static int rsi_match(struct cache_head *a, struct cache_head *b) { struct rsi *item = container_of(a, struct rsi, h); struct rsi *tmp = container_of(b, struct rsi, h); return netobj_equal(&item->in_handle, &tmp->in_handle) && netobj_equal(&item->in_token, &tmp->in_token); } static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len) { dst->len = len; dst->data = (len ? kmemdup(src, len, GFP_KERNEL) : NULL); if (len && !dst->data) return -ENOMEM; return 0; } static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) { return dup_to_netobj(dst, src->data, src->len); } static void rsi_init(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); new->out_handle.data = NULL; new->out_handle.len = 0; new->out_token.data = NULL; new->out_token.len = 0; new->in_handle.len = item->in_handle.len; item->in_handle.len = 0; new->in_token.len = item->in_token.len; item->in_token.len = 0; new->in_handle.data = item->in_handle.data; item->in_handle.data = NULL; new->in_token.data = item->in_token.data; item->in_token.data = NULL; } static void update_rsi(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); BUG_ON(new->out_handle.data || new->out_token.data); new->out_handle.len = item->out_handle.len; item->out_handle.len = 0; new->out_token.len = item->out_token.len; item->out_token.len = 0; new->out_handle.data = item->out_handle.data; item->out_handle.data = NULL; new->out_token.data = item->out_token.data; item->out_token.data = NULL; new->major_status = item->major_status; new->minor_status = item->minor_status; } static struct cache_head *rsi_alloc(void) { struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); if (rsii) return &rsii->h; else return NULL; } static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void rsi_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { struct rsi *rsii = container_of(h, struct rsi, h); qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); (*bpp)[-1] = '\n'; WARN_ONCE(*blen < 0, "RPCSEC/GSS credential too large - please use gssproxy\n"); } static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { /* context token expiry major minor context token */ char *buf = mesg; char *ep; int len; struct rsi rsii, *rsip = NULL; time64_t expiry; int status = -EINVAL; memset(&rsii, 0, sizeof(rsii)); /* handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_handle, buf, len)) goto out; /* token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_token, buf, len)) goto out; rsip = rsi_lookup(cd, &rsii); if (!rsip) goto out; rsii.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; /* major/minor */ len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.major_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.minor_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; /* out_handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_handle, buf, len)) goto out; /* out_token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_token, buf, len)) goto out; rsii.h.expiry_time = expiry; rsip = rsi_update(cd, &rsii, rsip); status = 0; out: rsi_free(&rsii); if (rsip) cache_put(&rsip->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsi_cache_template = { .owner = THIS_MODULE, .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", .cache_put = rsi_put, .cache_upcall = rsi_upcall, .cache_request = rsi_request, .cache_parse = rsi_parse, .match = rsi_match, .init = rsi_init, .update = update_rsi, .alloc = rsi_alloc, }; static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item) { struct cache_head *ch; int hash = rsi_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old) { struct cache_head *ch; int hash = rsi_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } /* * The rpcsec_context cache is used to store a context that is * used in data exchange. * The key is a context handle. The content is: * uid, gidlist, mechanism, service-set, mech-specific-data */ #define RSC_HASHBITS 10 #define RSC_HASHMAX (1<<RSC_HASHBITS) #define GSS_SEQ_WIN 128 struct gss_svc_seq_data { /* highest seq number seen so far: */ u32 sd_max; /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of * sd_win is nonzero iff sequence number i has been seen already: */ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; spinlock_t sd_lock; }; struct rsc { struct cache_head h; struct xdr_netobj handle; struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; struct rcu_head rcu_head; }; static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item); static void rsc_free(struct rsc *rsci) { kfree(rsci->handle.data); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); } static void rsc_free_rcu(struct rcu_head *head) { struct rsc *rsci = container_of(head, struct rsc, rcu_head); kfree(rsci->handle.data); kfree(rsci); } static void rsc_put(struct kref *ref) { struct rsc *rsci = container_of(ref, struct rsc, h.ref); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); call_rcu(&rsci->rcu_head, rsc_free_rcu); } static inline int rsc_hash(struct rsc *rsci) { return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); } static int rsc_match(struct cache_head *a, struct cache_head *b) { struct rsc *new = container_of(a, struct rsc, h); struct rsc *tmp = container_of(b, struct rsc, h); return netobj_equal(&new->handle, &tmp->handle); } static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->handle.len = tmp->handle.len; tmp->handle.len = 0; new->handle.data = tmp->handle.data; tmp->handle.data = NULL; new->mechctx = NULL; init_svc_cred(&new->cred); } static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->mechctx = tmp->mechctx; tmp->mechctx = NULL; memset(&new->seqdata, 0, sizeof(new->seqdata)); spin_lock_init(&new->seqdata.sd_lock); new->cred = tmp->cred; init_svc_cred(&tmp->cred); } static struct cache_head * rsc_alloc(void) { struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); if (rsci) return &rsci->h; else return NULL; } static int rsc_upcall(struct cache_detail *cd, struct cache_head *h) { return -EINVAL; } static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) { /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */ char *buf = mesg; int id; int len, rv; struct rsc rsci, *rscp = NULL; time64_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; memset(&rsci, 0, sizeof(rsci)); /* context handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsci.handle, buf, len)) goto out; rsci.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* uid, or NEGATIVE */ rv = get_int(&mesg, &id); if (rv == -EINVAL) goto out; if (rv == -ENOENT) set_bit(CACHE_NEGATIVE, &rsci.h.flags); else { int N, i; /* * NOTE: we skip uid_valid()/gid_valid() checks here: * instead, * -1 id's are later mapped to the * (export-specific) anonymous id by nfsd_setuser. * * (But supplementary gid's get no such special * treatment so are checked for validity here.) */ /* uid */ rsci.cred.cr_uid = make_kuid(current_user_ns(), id); /* gid */ if (get_int(&mesg, &id)) goto out; rsci.cred.cr_gid = make_kgid(current_user_ns(), id); /* number of additional gid's */ if (get_int(&mesg, &N)) goto out; if (N < 0 || N > NGROUPS_MAX) goto out; status = -ENOMEM; rsci.cred.cr_group_info = groups_alloc(N); if (rsci.cred.cr_group_info == NULL) goto out; /* gid's */ status = -EINVAL; for (i=0; i<N; i++) { kgid_t kgid; if (get_int(&mesg, &id)) goto out; kgid = make_kgid(current_user_ns(), id); if (!gid_valid(kgid)) goto out; rsci.cred.cr_group_info->gid[i] = kgid; } groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; gm = rsci.cred.cr_gss_mech = gss_mech_get_by_name(buf); status = -EOPNOTSUPP; if (!gm) goto out; status = -EINVAL; /* mech-specific data: */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, NULL, GFP_KERNEL); if (status) goto out; /* get client name */ len = qword_get(&mesg, buf, mlen); if (len > 0) { rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL); if (!rsci.cred.cr_principal) { status = -ENOMEM; goto out; } } } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsc_cache_template = { .owner = THIS_MODULE, .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", .cache_put = rsc_put, .cache_upcall = rsc_upcall, .cache_parse = rsc_parse, .match = rsc_match, .init = rsc_init, .update = update_rsc, .alloc = rsc_alloc, }; static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item) { struct cache_head *ch; int hash = rsc_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old) { struct cache_head *ch; int hash = rsc_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc * gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) { struct rsc rsci; struct rsc *found; memset(&rsci, 0, sizeof(rsci)); if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) return NULL; found = rsc_lookup(cd, &rsci); rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) return NULL; return found; } /** * gss_check_seq_num - GSS sequence number window check * @rqstp: RPC Call to use when reporting errors * @rsci: cached GSS context state (updated on return) * @seq_num: sequence number to check * * Implements sequence number algorithm as specified in * RFC 2203, Section 5.3.3.1. "Context Management". * * Return values: * %true: @rqstp's GSS sequence number is inside the window * %false: @rqstp's GSS sequence number is outside the window */ static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, u32 seq_num) { struct gss_svc_seq_data *sd = &rsci->seqdata; bool result = false; spin_lock(&sd->sd_lock); if (seq_num > sd->sd_max) { if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { memset(sd->sd_win, 0, sizeof(sd->sd_win)); sd->sd_max = seq_num; } else while (sd->sd_max < seq_num) { sd->sd_max++; __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win); } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { goto toolow; } if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) goto alreadyseen; ok: result = true; out: spin_unlock(&sd->sd_lock); return result; toolow: trace_rpcgss_svc_seqno_low(rqstp, seq_num, sd->sd_max - GSS_SEQ_WIN, sd->sd_max); goto out; alreadyseen: trace_rpcgss_svc_seqno_seen(rqstp, seq_num); goto out; } /* * Decode and verify a Call's verifier field. For RPC_AUTH_GSS Calls, * the body of this field contains a variable length checksum. * * GSS-specific auth_stat values are mandated by RFC 2203 Section * 5.3.3.3. */ static int svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct gss_ctx *ctx_id = rsci->mechctx; u32 flavor, maj_stat; struct xdr_buf rpchdr; struct xdr_netobj checksum; struct kvec iov; /* * Compute the checksum of the incoming Call from the * XID field to credential field: */ iov.iov_base = rpcstart; iov.iov_len = (u8 *)xdr->p - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, (void **)&checksum.data, &checksum.len) < 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (flavor != RPC_AUTH_GSS || checksum.len < XDR_UNIT) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (rqstp->rq_deferred) return SVC_OK; maj_stat = gss_verify_mic(ctx_id, &rpchdr, &checksum); if (maj_stat != GSS_S_COMPLETE) { trace_rpcgss_svc_mic(rqstp, maj_stat); rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) return SVC_DROP; return SVC_OK; } /* * Construct and encode a Reply's verifier field. The verifier's body * field contains a variable-length checksum of the GSS sequence * number. */ static bool svcauth_gss_encode_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) { struct gss_svc_data *gsd = rqstp->rq_auth_data; u32 maj_stat; struct xdr_buf verf_data; struct xdr_netobj checksum; struct kvec iov; gsd->gsd_seq_num = cpu_to_be32(seq); iov.iov_base = &gsd->gsd_seq_num; iov.iov_len = XDR_UNIT; xdr_buf_from_iov(&iov, &verf_data); checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(ctx_id, &verf_data, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; return xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, RPC_AUTH_GSS, checksum.data, checksum.len) > 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return false; } struct gss_domain { struct auth_domain h; u32 pseudoflavor; }; static struct auth_domain * find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) { char *name; name = gss_service_to_auth_domain_name(ctx->mech_type, svc); if (!name) return NULL; return auth_domain_find(name); } static struct auth_ops svcauthops_gss; u32 svcauth_gss_flavor(struct auth_domain *dom) { struct gss_domain *gd = container_of(dom, struct gss_domain, h); return gd->pseudoflavor; } EXPORT_SYMBOL_GPL(svcauth_gss_flavor); struct auth_domain * svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { struct gss_domain *new; struct auth_domain *test; int stat = -ENOMEM; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out; kref_init(&new->h.ref); new->h.name = kstrdup(name, GFP_KERNEL); if (!new->h.name) goto out_free_dom; new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; test = auth_domain_lookup(name, &new->h); if (test != &new->h) { pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n", name); stat = -EADDRINUSE; auth_domain_put(test); goto out_free_name; } return test; out_free_name: kfree(new->h.name); out_free_dom: kfree(new); out: return ERR_PTR(stat); } EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, offset, seq_num, maj_stat; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (len & 3) goto unwrap_failed; offset = xdr_stream_pos(xdr); if (xdr_buf_subsegment(buf, &databody_integ, offset, len)) goto unwrap_failed; /* * The xdr_stream now points to the @seq_num field. The next * XDR data item is the @arg field, which contains the clear * text RPC program payload. The checksum, which follows the * @arg field, is located and decoded without updating the * xdr_stream. */ offset += len; if (xdr_decode_word(buf, offset, &checksum.len)) goto unwrap_failed; if (checksum.len > sizeof(gsd->gsd_scratch)) goto unwrap_failed; checksum.data = gsd->gsd_scratch; if (read_bytes_from_xdr_buf(buf, offset + XDR_UNIT, checksum.data, checksum.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; /* The received seqno is protected by the checksum. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; xdr_truncate_decode(xdr, XDR_UNIT + checksum.len); return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_mic: trace_rpcgss_svc_mic(rqstp, maj_stat); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, maj_stat, seq_num, offset; struct xdr_buf *buf = xdr->buf; unsigned int saved_len; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (rqstp->rq_deferred) { /* Already decrypted last time through! The sequence number * check at out_seq is unnecessary but harmless: */ goto out_seq; } if (len > xdr_stream_remaining(xdr)) goto unwrap_failed; offset = xdr_stream_pos(xdr); saved_len = buf->len; maj_stat = gss_unwrap(ctx, offset, offset + len, buf); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; xdr->nwords -= XDR_QUADLEN(saved_len - buf->len); out_seq: /* gss_unwrap() decrypted the sequence number. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_unwrap: trace_rpcgss_svc_unwrap(rqstp, maj_stat); return -EINVAL; } static enum svc_auth_status svcauth_gss_set_client(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; struct rsc *rsci = svcdata->rsci; struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; rqstp->rq_auth_stat = rpc_autherr_badcred; /* * A gss export can be specified either by: * export *(sec=krb5,rw) * or by * export gss/krb5(rw) * The latter is deprecated; but for backwards compatibility reasons * the nfsd code will still fall back on trying it if the former * doesn't work; so we try to make both available to nfsd, below. */ rqstp->rq_gssclient = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); if (rqstp->rq_gssclient == NULL) return SVC_DENIED; stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } static bool svcauth_gss_proc_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp, struct xdr_netobj *out_handle, int *major_status, u32 seq_num) { struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rsc *rsci; bool rc; if (*major_status != GSS_S_COMPLETE) goto null_verifier; rsci = gss_svc_searchbyctx(cd, out_handle); if (rsci == NULL) { *major_status = GSS_S_NO_CONTEXT; goto null_verifier; } rc = svcauth_gss_encode_verf(rqstp, rsci->mechctx, seq_num); cache_put(&rsci->h, cd); return rc; null_verifier: return xdr_stream_encode_opaque_auth(xdr, RPC_AUTH_NULL, NULL, 0) > 0; } static void gss_free_in_token_pages(struct gssp_in_token *in_token) { int i; i = 0; while (in_token->pages[i]) put_page(in_token->pages[i++]); kfree(in_token->pages); in_token->pages = NULL; } static int gss_read_proxy_verf(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; unsigned int length, pgto_offs, pgfrom_offs; int pages, i, pgto, pgfrom; size_t to_offs, from_offs; u32 inlen; if (dup_netobj(in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &inlen) < 0) goto out_denied_free; if (inlen > xdr_stream_remaining(xdr)) goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; in_token->page_len = inlen; for (i = 0; i < pages; i++) { in_token->pages[i] = alloc_page(GFP_KERNEL); if (!in_token->pages[i]) { gss_free_in_token_pages(in_token); goto out_denied_free; } } length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p); memcpy(page_address(in_token->pages[0]), xdr->p, length); inlen -= length; to_offs = length; from_offs = rqstp->rq_arg.page_base; while (inlen) { pgto = to_offs >> PAGE_SHIFT; pgfrom = from_offs >> PAGE_SHIFT; pgto_offs = to_offs & ~PAGE_MASK; pgfrom_offs = from_offs & ~PAGE_MASK; length = min_t(unsigned int, inlen, min_t(unsigned int, PAGE_SIZE - pgto_offs, PAGE_SIZE - pgfrom_offs)); memcpy(page_address(in_token->pages[pgto]) + pgto_offs, page_address(rqstp->rq_arg.pages[pgfrom]) + pgfrom_offs, length); to_offs += length; from_offs += length; inlen -= length; } return 0; out_denied_free: kfree(in_handle->data); return SVC_DENIED; } /* * RFC 2203, Section 5.2.3.1. * * struct rpc_gss_init_res { * opaque handle<>; * unsigned int gss_major; * unsigned int gss_minor; * unsigned int seq_window; * opaque gss_token<>; * }; */ static bool svcxdr_encode_gss_init_res(struct xdr_stream *xdr, struct xdr_netobj *handle, struct xdr_netobj *gss_token, unsigned int major_status, unsigned int minor_status, u32 seq_num) { if (xdr_stream_encode_opaque(xdr, handle->data, handle->len) < 0) return false; if (xdr_stream_encode_u32(xdr, major_status) < 0) return false; if (xdr_stream_encode_u32(xdr, minor_status) < 0) return false; if (xdr_stream_encode_u32(xdr, seq_num) < 0) return false; if (xdr_stream_encode_opaque(xdr, gss_token->data, gss_token->len) < 0) return false; return true; } /* * Having read the cred already and found we're in the context * initiation case, read the verifier and initiate (or check the results * of) upcalls to userspace for help with context initiation. If * the upcall results are available, write the verifier and result. * Otherwise, drop the request pending an answer to the upcall. */ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct rsi *rsip, rsikey; __be32 *p; u32 len; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &len) < 0) { kfree(rsikey.in_handle.data); return SVC_DENIED; } p = xdr_inline_decode(xdr, len); if (!p) { kfree(rsikey.in_handle.data); return SVC_DENIED; } rsikey.in_token.data = kmalloc(len, GFP_KERNEL); if (ZERO_OR_NULL_PTR(rsikey.in_token.data)) { kfree(rsikey.in_handle.data); return SVC_CLOSE; } memcpy(rsikey.in_token.data, p, len); rsikey.in_token.len = len; /* Perform upcall, or find upcall result: */ rsip = rsi_lookup(sn->rsi_cache, &rsikey); rsi_free(&rsikey); if (!rsip) return SVC_CLOSE; if (cache_check(sn->rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0) /* No upcall result: */ return SVC_CLOSE; ret = SVC_CLOSE; if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &rsip->out_handle, &rsip->major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &rsip->out_handle, &rsip->out_token, rsip->major_status, rsip->minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: cache_put(&rsip->h, sn->rsi_cache); return ret; } static int gss_proxy_save_rsc(struct cache_detail *cd, struct gssp_upcall_data *ud, uint64_t *handle) { struct rsc rsci, *rscp = NULL; static atomic64_t ctxhctr; long long ctxh; struct gss_api_mech *gm = NULL; time64_t expiry; int status; memset(&rsci, 0, sizeof(rsci)); /* context handle */ status = -ENOMEM; /* the handle needs to be just a unique id, * use a static counter */ ctxh = atomic64_inc_return(&ctxhctr); /* make a copy for the caller */ *handle = ctxh; /* make a copy for the rsc cache */ if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t))) goto out; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* creds */ if (!ud->found_creds) { /* userspace seem buggy, we should always get at least a * mapping to nobody */ goto out; } else { struct timespec64 boot; /* steal creds */ rsci.cred = ud->creds; memset(&ud->creds, 0, sizeof(struct svc_cred)); status = -EOPNOTSUPP; /* get mech handle from OID */ gm = gss_mech_get_by_OID(&ud->mech_oid); if (!gm) goto out; rsci.cred.cr_gss_mech = gm; status = -EINVAL; /* mech-specific data: */ status = gss_import_sec_context(ud->out_handle.data, ud->out_handle.len, gm, &rsci.mechctx, &expiry, GFP_KERNEL); if (status) goto out; getboottime64(&boot); expiry -= boot.tv_sec; } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_netobj cli_handle; struct gssp_upcall_data ud; uint64_t handle; int status; int ret; struct net *net = SVC_NET(rqstp); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; ret = SVC_CLOSE; /* Perform synchronous upcall to gss-proxy */ status = gssp_accept_sec_context_upcall(net, &ud); if (status) goto out; trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: cli_handle = ud.out_handle; break; case GSS_S_COMPLETE: status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); if (status) goto out; cli_handle.data = (u8 *)&handle; cli_handle.len = sizeof(handle); break; default: goto out; } if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &cli_handle, &ud.major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &cli_handle, &ud.out_token, ud.major_status, ud.minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: gss_free_in_token_pages(&ud.in_token); gssp_free_upcall_data(&ud); return ret; } /* * Try to set the sn->use_gss_proxy variable to a new value. We only allow * it to be changed if it's currently undefined (-1). If it's any other value * then return -EBUSY unless the type wouldn't have changed anyway. */ static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; WARN_ON_ONCE(type != 0 && type != 1); ret = cmpxchg(&sn->use_gss_proxy, -1, type); if (ret != -1 && ret != type) return -EBUSY; return 0; } static bool use_gss_proxy(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); /* If use_gss_proxy is still undefined, then try to disable it */ if (sn->use_gss_proxy == -1) set_gss_proxy(net, 0); return sn->use_gss_proxy; } static noinline_for_stack int svcauth_gss_proc_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 flavor, len; void *body; /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) return SVC_GARBAGE; if (flavor != RPC_AUTH_NULL || len != 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) { rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (!use_gss_proxy(SVC_NET(rqstp))) return svcauth_gss_legacy_init(rqstp, gc); return svcauth_gss_proxy_init(rqstp, gc); } #ifdef CONFIG_PROC_FS static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; if (copy_from_user(tbuf, buf, count)) return -EFAULT; tbuf[count] = 0; res = kstrtoul(tbuf, 0, &i); if (res) return res; if (i != 1) return -EINVAL; res = set_gssp_clnt(net); if (res) return res; res = set_gss_proxy(net, 1); if (res) return res; return count; } static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; size_t len; snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy); len = strlen(tbuf); if (p >= len) return 0; len -= p; if (len > count) len = count; if (copy_to_user(buf, (void *)(tbuf+p), len)) return -EFAULT; *ppos += len; return len; } static const struct proc_ops use_gss_proxy_proc_ops = { .proc_open = nonseekable_open, .proc_write = write_gssp, .proc_read = read_gssp, }; static int create_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct proc_dir_entry **p = &sn->use_gssp_proc; sn->use_gss_proxy = -1; *p = proc_create_data("use-gss-proxy", S_IFREG | 0600, sn->proc_net_rpc, &use_gss_proxy_proc_ops, net); if (!*p) return -ENOMEM; init_gssp_clnt(sn); return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->use_gssp_proc) { remove_proc_entry("use-gss-proxy", sn->proc_net_rpc); clear_gssp_clnt(sn); } } static ssize_t read_gss_krb5_enctypes(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct rpcsec_gss_oid oid = { .len = 9, .data = "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02", }; struct gss_api_mech *mech; ssize_t ret; mech = gss_mech_get_by_OID(&oid); if (!mech) return 0; if (!mech->gm_upcall_enctypes) { gss_mech_put(mech); return 0; } ret = simple_read_from_buffer(buf, count, ppos, mech->gm_upcall_enctypes, strlen(mech->gm_upcall_enctypes)); gss_mech_put(mech); return ret; } static const struct proc_ops gss_krb5_enctypes_proc_ops = { .proc_open = nonseekable_open, .proc_read = read_gss_krb5_enctypes, }; static int create_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); sn->gss_krb5_enctypes = proc_create_data("gss_krb5_enctypes", S_IFREG | 0444, sn->proc_net_rpc, &gss_krb5_enctypes_proc_ops, net); return sn->gss_krb5_enctypes ? 0 : -ENOMEM; } static void destroy_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->gss_krb5_enctypes) remove_proc_entry("gss_krb5_enctypes", sn->proc_net_rpc); } #else /* CONFIG_PROC_FS */ static int create_use_gss_proxy_proc_entry(struct net *net) { return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) {} static int create_krb5_enctypes_proc_entry(struct net *net) { return 0; } static void destroy_krb5_enctypes_proc_entry(struct net *net) {} #endif /* CONFIG_PROC_FS */ /* * The Call's credential body should contain a struct rpc_gss_cred_t. * * RFC 2203 Section 5 * * struct rpc_gss_cred_t { * union switch (unsigned int version) { * case RPCSEC_GSS_VERS_1: * struct { * rpc_gss_proc_t gss_proc; * unsigned int seq_num; * rpc_gss_service_t service; * opaque handle<>; * } rpc_gss_cred_vers_1_t; * } * }; */ static bool svcauth_gss_decode_credbody(struct xdr_stream *xdr, struct rpc_gss_wire_cred *gc, __be32 **rpcstart) { ssize_t handle_len; u32 body_len; __be32 *p; p = xdr_inline_decode(xdr, XDR_UNIT); if (!p) return false; /* * start of rpc packet is 7 u32's back from here: * xid direction rpcversion prog vers proc flavour */ *rpcstart = p - 7; body_len = be32_to_cpup(p); if (body_len > RPC_MAX_AUTH_SIZE) return false; /* struct rpc_gss_cred_t */ if (xdr_stream_decode_u32(xdr, &gc->gc_v) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_proc) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_seq) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_svc) < 0) return false; handle_len = xdr_stream_decode_opaque_inline(xdr, (void **)&gc->gc_ctx.data, body_len); if (handle_len < 0) return false; if (body_len != XDR_UNIT * 5 + xdr_align_size(handle_len)) return false; gc->gc_ctx.len = handle_len; return true; } /** * svcauth_gss_accept - Decode and validate incoming RPC_AUTH_GSS credential * @rqstp: RPC transaction * * Return values: * %SVC_OK: Success * %SVC_COMPLETE: GSS context lifetime event * %SVC_DENIED: Credential or verifier is not valid * %SVC_GARBAGE: Failed to decode credential or verifier * %SVC_CLOSE: Temporary failure * * The rqstp->rq_auth_stat field is also set (see RFCs 2203 and 5531). */ static enum svc_auth_status svcauth_gss_accept(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; __be32 *rpcstart; struct rpc_gss_wire_cred *gc; struct rsc *rsci = NULL; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); rqstp->rq_auth_stat = rpc_autherr_failed; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; svcdata->gsd_databody_offset = 0; svcdata->rsci = NULL; gc = &svcdata->clcred; rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart)) goto auth_err; if (gc->gc_v != RPC_GSS_VERSION) goto auth_err; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (rqstp->rq_proc != 0) goto auth_err; return svcauth_gss_proc_init(rqstp, gc); case RPC_GSS_PROC_DESTROY: if (rqstp->rq_proc != 0) goto auth_err; fallthrough; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; switch (svcauth_gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: goto auth_err; case SVC_DROP: goto drop; } break; default: if (rqstp->rq_proc != 0) goto auth_err; rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } /* now act upon the command: */ switch (gc->gc_proc) { case RPC_GSS_PROC_DESTROY: if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; /* Delete the entry from the cache_list and call cache_put */ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); goto complete; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; svcdata->gsd_databody_offset = xdr_stream_pos(&rqstp->rq_res_stream); rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_integ(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE); break; case RPC_GSS_SVC_PRIVACY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_priv(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE * 2); break; default: goto auth_err; } svcdata->rsci = rsci; cache_get(&rsci->h); rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor( rsci->mechctx->mech_type, GSS_C_QOP_DEFAULT, gc->gc_svc); ret = SVC_OK; trace_rpcgss_svc_authenticate(rqstp, gc); goto out; } garbage_args: ret = SVC_GARBAGE; goto out; auth_err: xdr_truncate_encode(&rqstp->rq_res_stream, XDR_UNIT * 2); ret = SVC_DENIED; goto out; complete: ret = SVC_COMPLETE; goto out; drop: ret = SVC_CLOSE; out: if (rsci) cache_put(&rsci->h, sn->rsc_cache); return ret; } static u32 svcauth_gss_prepare_to_wrap(struct svc_rqst *rqstp, struct gss_svc_data *gsd) { u32 offset; /* Release can be called twice, but we only wrap once. */ offset = gsd->gsd_databody_offset; gsd->gsd_databody_offset = 0; /* AUTH_ERROR replies are not wrapped. */ if (rqstp->rq_auth_stat != rpc_auth_ok) return 0; /* Also don't wrap if the accept_stat is nonzero: */ if (*rqstp->rq_accept_statp != rpc_success) return 0; return offset; } /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * The RPC Reply message has already been XDR-encoded. rq_res_stream * is now positioned so that the checksum can be written just past * the RPC Reply message. */ static int svcauth_gss_wrap_integ(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; u32 offset, maj_stat; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) goto out; if (xdr_buf_subsegment(buf, &databody_integ, offset + XDR_UNIT, buf->len - offset - XDR_UNIT)) goto wrap_failed; /* Buffer space for these has already been reserved in * svcauth_gss_accept(). */ if (xdr_encode_word(buf, offset, databody_integ.len)) goto wrap_failed; if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(gsd->rsci->mechctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; if (xdr_stream_encode_opaque(xdr, checksum.data, checksum.len) < 0) goto wrap_failed; xdr_commit_encode(xdr); out: return 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return -EINVAL; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * gss_wrap() expands the size of the RPC message payload in the * response buffer. The main purpose of svcauth_gss_wrap_priv() * is to ensure there is adequate space in the response buffer to * avoid overflow during the wrap. */ static int svcauth_gss_wrap_priv(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = &rqstp->rq_res; struct kvec *head = buf->head; struct kvec *tail = buf->tail; u32 offset, pad, maj_stat; __be32 *p; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) return 0; /* * Buffer space for this field has already been reserved * in svcauth_gss_accept(). Note that the GSS sequence * number is encrypted along with the RPC reply payload. */ if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; /* * If there is currently tail data, make sure there is * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in * the page, and move the current tail data such that * there is RPC_MAX_AUTH_SIZE slack space available in * both the head and tail. */ if (tail->iov_base) { if (tail->iov_base >= head->iov_base + PAGE_SIZE) goto wrap_failed; if (tail->iov_base < head->iov_base) goto wrap_failed; if (tail->iov_len + head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; memmove(tail->iov_base + RPC_MAX_AUTH_SIZE, tail->iov_base, tail->iov_len); tail->iov_base += RPC_MAX_AUTH_SIZE; } /* * If there is no current tail data, make sure there is * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the * allotted page, and set up tail information such that there * is RPC_MAX_AUTH_SIZE slack space available in both the * head and tail. */ if (!tail->iov_base) { if (head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; tail->iov_base = head->iov_base + head->iov_len + RPC_MAX_AUTH_SIZE; tail->iov_len = 0; } maj_stat = gss_wrap(gsd->rsci->mechctx, offset + XDR_UNIT, buf, buf->pages); if (maj_stat != GSS_S_COMPLETE) goto bad_wrap; /* Wrapping can change the size of databody_priv. */ if (xdr_encode_word(buf, offset, buf->len - offset - XDR_UNIT)) goto wrap_failed; pad = xdr_pad_size(buf->len - offset - XDR_UNIT); p = (__be32 *)(tail->iov_base + tail->iov_len); memset(p, 0, pad); tail->iov_len += pad; buf->len += pad; return 0; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; bad_wrap: trace_rpcgss_svc_wrap(rqstp, maj_stat); return -ENOMEM; } /** * svcauth_gss_release - Wrap payload and release resources * @rqstp: RPC transaction context * * Return values: * %0: the Reply is ready to be sent * %-ENOMEM: failed to allocate memory * %-EINVAL: encoding error */ static int svcauth_gss_release(struct svc_rqst *rqstp) { struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc; int stat; if (!gsd) goto out; gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: stat = svcauth_gss_wrap_integ(rqstp); if (stat) goto out_err; break; case RPC_GSS_SVC_PRIVACY: stat = svcauth_gss_wrap_priv(rqstp); if (stat) goto out_err; break; /* * For any other gc_svc value, svcauth_gss_accept() already set * the auth_error appropriately; just fall through: */ } out: stat = 0; out_err: if (rqstp->rq_client) auth_domain_put(rqstp->rq_client); rqstp->rq_client = NULL; if (rqstp->rq_gssclient) auth_domain_put(rqstp->rq_gssclient); rqstp->rq_gssclient = NULL; if (rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; if (gsd && gsd->rsci) { cache_put(&gsd->rsci->h, sn->rsc_cache); gsd->rsci = NULL; } return stat; } static void svcauth_gss_domain_release_rcu(struct rcu_head *head) { struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct gss_domain *gd = container_of(dom, struct gss_domain, h); kfree(dom->name); kfree(gd); } static void svcauth_gss_domain_release(struct auth_domain *dom) { call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); } static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp) { return svcauth_gss_flavor(rqstp->rq_gssclient); } static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, .flavour = RPC_AUTH_GSS, .accept = svcauth_gss_accept, .release = svcauth_gss_release, .domain_release = svcauth_gss_domain_release, .set_client = svcauth_gss_set_client, .pseudoflavor = svcauth_gss_pseudoflavor, }; static int rsi_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsi_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsi_cache = cd; return 0; } static void rsi_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsi_cache; sn->rsi_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } static int rsc_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsc_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsc_cache = cd; return 0; } static void rsc_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsc_cache; sn->rsc_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } int gss_svc_init_net(struct net *net) { int rv; rv = rsc_cache_create_net(net); if (rv) return rv; rv = rsi_cache_create_net(net); if (rv) goto out1; rv = create_use_gss_proxy_proc_entry(net); if (rv) goto out2; rv = create_krb5_enctypes_proc_entry(net); if (rv) goto out3; return 0; out3: destroy_use_gss_proxy_proc_entry(net); out2: rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; } void gss_svc_shutdown_net(struct net *net) { destroy_krb5_enctypes_proc_entry(net); destroy_use_gss_proxy_proc_entry(net); rsi_cache_destroy_net(net); rsc_cache_destroy_net(net); } int gss_svc_init(void) { return svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); } void gss_svc_shutdown(void) { svc_auth_unregister(RPC_AUTH_GSS); }
87 45 9 1713 1644 24 29 128 22 199 1839 286 2 281 279 279 1486 3 11 13 25 24 9 24 20 38 1639 42 118 3 2 15 41 4 41 126 136 1853 11 3 6 3 1 50 46 8 4 1716 2 2 1 1 1 1 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_NET_H #define _LINUX_VIRTIO_NET_H #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/udp.h> #include <uapi/linux/tcp.h> #include <uapi/linux/virtio_net.h> static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) { switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: return protocol == cpu_to_be16(ETH_P_IP); case VIRTIO_NET_HDR_GSO_TCPV6: return protocol == cpu_to_be16(ETH_P_IPV6); case VIRTIO_NET_HDR_GSO_UDP: case VIRTIO_NET_HDR_GSO_UDP_L4: return protocol == cpu_to_be16(ETH_P_IP) || protocol == cpu_to_be16(ETH_P_IPV6); default: return false; } } static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { if (skb->protocol) return 0; switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: case VIRTIO_NET_HDR_GSO_UDP_L4: skb->protocol = cpu_to_be16(ETH_P_IP); break; case VIRTIO_NET_HDR_GSO_TCPV6: skb->protocol = cpu_to_be16(ETH_P_IPV6); break; default: return -EINVAL; } return 0; } static inline int __virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian, u8 hdr_gso_type) { unsigned int nh_min_len = sizeof(struct iphdr); unsigned int gso_type = 0; unsigned int thlen = 0; unsigned int p_off = 0; unsigned int ip_proto; if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr_gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: gso_type = SKB_GSO_TCPV4; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); break; case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); nh_min_len = sizeof(struct ipv6hdr); break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; case VIRTIO_NET_HDR_GSO_UDP_L4: gso_type = SKB_GSO_UDP_L4; ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; default: return -EINVAL; } if (hdr_gso_type & VIRTIO_NET_HDR_GSO_ECN) gso_type |= SKB_GSO_TCP_ECN; if (hdr->gso_size == 0) return -EINVAL; } skb_reset_mac_header(skb); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start); u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); if (!pskb_may_pull(skb, needed)) return -EINVAL; if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; if (skb_transport_offset(skb) < nh_min_len) return -EINVAL; nh_min_len = skb_transport_offset(skb); p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ if (gso_type && skb->network_header) { struct flow_keys_basic keys; if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); if (!protocol) virtio_net_hdr_set_proto(skb, hdr); else if (!virtio_net_hdr_match_proto(protocol, hdr_gso_type)) return -EINVAL; else skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) { /* UFO does not specify ipv4 or 6: try both */ if (gso_type & SKB_GSO_UDP && skb->protocol == htons(ETH_P_IP)) { skb->protocol = htons(ETH_P_IPV6); goto retry; } return -EINVAL; } p_off = keys.control.thoff + thlen; if (!pskb_may_pull(skb, p_off) || keys.basic.ip_proto != ip_proto) return -EINVAL; skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } } if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); switch (gso_type & ~SKB_GSO_TCP_ECN) { case SKB_GSO_UDP: /* UFO may not include transport header in gso_size. */ nh_off -= thlen; break; case SKB_GSO_UDP_L4: if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; if (skb->csum_offset != offsetof(struct udphdr, check)) return -EINVAL; if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS) return -EINVAL; if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; case SKB_GSO_TCPV4: case SKB_GSO_TCPV6: if (skb->ip_summed == CHECKSUM_PARTIAL && skb->csum_offset != offsetof(struct tcphdr, check)) return -EINVAL; break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ if (gso_size == GSO_BY_FRAGS) return -EINVAL; /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } } return 0; } static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { return __virtio_net_hdr_to_skb(skb, hdr, little_endian, hdr->gso_type); } static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, bool has_data_valid, int vlan_hlen) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ if (skb_is_gso(skb)) { struct skb_shared_info *sinfo = skb_shinfo(skb); /* This is a hint as to how much should be linear. */ hdr->hdr_len = __cpu_to_virtio16(little_endian, skb_headlen(skb)); hdr->gso_size = __cpu_to_virtio16(little_endian, sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else if (sinfo->gso_type & SKB_GSO_UDP_L4) hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = __cpu_to_virtio16(little_endian, skb_checksum_start_offset(skb) + vlan_hlen); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); } else if (has_data_valid && skb->ip_summed == CHECKSUM_UNNECESSARY) { hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; } static inline unsigned int virtio_l3min(bool is_ipv6) { return is_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); } static inline int virtio_net_hdr_tnl_to_skb(struct sk_buff *skb, const struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool tnl_csum_negotiated, bool little_endian) { const struct virtio_net_hdr *hdr = (const struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th, inner_th; unsigned int inner_l3min, outer_l3min; u8 gso_inner_type, gso_tunnel_type; bool outer_isv6, inner_isv6; int ret; gso_tunnel_type = hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL; if (!gso_tunnel_type) return virtio_net_hdr_to_skb(skb, hdr, little_endian); /* Tunnel not supported/negotiated, but the hdr asks for it. */ if (!tnl_hdr_negotiated) return -EINVAL; /* Either ipv4 or ipv6. */ if (gso_tunnel_type == VIRTIO_NET_HDR_GSO_UDP_TUNNEL) return -EINVAL; /* The UDP tunnel must carry a GSO packet, but no UFO. */ gso_inner_type = hdr->gso_type & ~(VIRTIO_NET_HDR_GSO_ECN | VIRTIO_NET_HDR_GSO_UDP_TUNNEL); if (!gso_inner_type || gso_inner_type == VIRTIO_NET_HDR_GSO_UDP) return -EINVAL; /* Rely on csum being present. */ if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; /* Validate offsets. */ outer_isv6 = gso_tunnel_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; inner_isv6 = gso_inner_type == VIRTIO_NET_HDR_GSO_TCPV6; inner_l3min = virtio_l3min(inner_isv6); outer_l3min = ETH_HLEN + virtio_l3min(outer_isv6); inner_th = __virtio16_to_cpu(little_endian, hdr->csum_start); inner_nh = le16_to_cpu(vhdr->inner_nh_offset); outer_th = le16_to_cpu(vhdr->outer_th_offset); if (outer_th < outer_l3min || inner_nh < outer_th + sizeof(struct udphdr) || inner_th < inner_nh + inner_l3min) return -EINVAL; /* Let the basic parsing deal with plain GSO features. */ ret = __virtio_net_hdr_to_skb(skb, hdr, true, hdr->gso_type & ~gso_tunnel_type); if (ret) return ret; /* In case of USO, the inner protocol is still unknown and * `inner_isv6` is just a guess, additional parsing is needed. * The previous validation ensures that accessing an ipv4 inner * network header is safe. */ if (gso_inner_type == VIRTIO_NET_HDR_GSO_UDP_L4) { struct iphdr *iphdr = (struct iphdr *)(skb->data + inner_nh); inner_isv6 = iphdr->version == 6; inner_l3min = virtio_l3min(inner_isv6); if (inner_th < inner_nh + inner_l3min) return -EINVAL; } skb_set_inner_protocol(skb, inner_isv6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP)); if (hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM) { if (!tnl_csum_negotiated) return -EINVAL; skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; } else { skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; } skb->inner_transport_header = inner_th + skb_headroom(skb); skb->inner_network_header = inner_nh + skb_headroom(skb); skb->inner_mac_header = inner_nh + skb_headroom(skb); skb->transport_header = outer_th + skb_headroom(skb); skb->encapsulation = 1; return 0; } /* Checksum-related fields validation for the driver */ static inline int virtio_net_handle_csum_offload(struct sk_buff *skb, struct virtio_net_hdr *hdr, bool tnl_csum_negotiated) { if (!(hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL)) { if (!(hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID)) return 0; skb->ip_summed = CHECKSUM_UNNECESSARY; if (!(hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM)) return 0; /* tunnel csum packets are invalid when the related * feature has not been negotiated */ if (!tnl_csum_negotiated) return -EINVAL; skb->csum_level = 1; return 0; } /* DATA_VALID is mutually exclusive with NEEDS_CSUM, and GSO * over UDP tunnel requires the latter */ if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) return -EINVAL; return 0; } /* * vlan_hlen always refers to the outermost MAC header. That also * means it refers to the only MAC header, if the packet does not carry * any encapsulation. */ static inline int virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool little_endian, int vlan_hlen) { struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th; int tnl_gso_type; int ret; tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM); if (!tnl_gso_type) return virtio_net_hdr_from_skb(skb, hdr, little_endian, false, vlan_hlen); /* Tunnel support not negotiated but skb ask for it. */ if (!tnl_hdr_negotiated) return -EINVAL; vhdr->hash_hdr.hash_value = 0; vhdr->hash_hdr.hash_report = 0; vhdr->hash_hdr.padding = 0; /* Let the basic parsing deal with plain GSO features. */ skb_shinfo(skb)->gso_type &= ~tnl_gso_type; ret = virtio_net_hdr_from_skb(skb, hdr, true, false, vlan_hlen); skb_shinfo(skb)->gso_type |= tnl_gso_type; if (ret) return ret; if (skb->protocol == htons(ETH_P_IPV6)) hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; else hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) hdr->flags |= VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM; inner_nh = skb->inner_network_header - skb_headroom(skb); outer_th = skb->transport_header - skb_headroom(skb); vhdr->inner_nh_offset = cpu_to_le16(inner_nh); vhdr->outer_th_offset = cpu_to_le16(outer_th); return 0; } #endif /* _LINUX_VIRTIO_NET_H */
92 139 140 400 640 105 17 90 105 65 1311 9 9 1362 1364 1363 18 61 14 54 61 1274 614 660 1356 1356 744 640 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/l3mdev.h - L3 master device API * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> */ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ #include <net/dst.h> #include <net/fib_rules.h> enum l3mdev_type { L3MDEV_TYPE_UNSPEC, L3MDEV_TYPE_VRF, __L3MDEV_TYPE_MAX }; #define L3MDEV_TYPE_MAX (__L3MDEV_TYPE_MAX - 1) typedef int (*lookup_by_table_id_t)(struct net *net, u32 table_d); /** * struct l3mdev_ops - l3mdev operations * * @l3mdev_fib_table: Get FIB table id to use for lookups * * @l3mdev_l3_rcv: Hook in L3 receive path * * @l3mdev_l3_out: Hook in L3 output path * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, struct sock *sk, struct sk_buff *skb, u16 proto); /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn); void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn); int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id); int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && fl->flowi_l3mdev == iifindex; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && fl->flowi_l3mdev == oifindex; } void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { int ifindex; rcu_read_lock(); ifindex = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); return ifindex; } static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { struct net_device *dev; int rc = 0; if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); } return rc; } static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. * list_first_or_null_rcu does not handle a const arg. We aren't * making changes, just want the master device from that list so * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; struct net_device *master; if (!dev) return NULL; if (netif_is_l3_master(dev)) master = dev; else if (netif_is_l3_slave(dev)) master = netdev_master_upper_dev_get_rcu(dev); else master = NULL; return master; } int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex); static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { rcu_read_lock(); ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex); rcu_read_unlock(); return ifindex; } u32 l3mdev_fib_table_rcu(const struct net_device *dev); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); static inline u32 l3mdev_fib_table(const struct net_device *dev) { u32 tb_id; rcu_read_lock(); tb_id = l3mdev_fib_table_rcu(dev); rcu_read_unlock(); return tb_id; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { struct net_device *dev; bool rc = false; if (ifindex == 0) return false; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = netif_is_l3_master(dev); rcu_read_unlock(); return rc; } struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) { struct net_device *master = NULL; if (netif_is_l3_slave(skb->dev)) master = netdev_master_upper_dev_get_rcu(skb->dev); else if (netif_is_l3_master(skb->dev) || netif_has_l3_rx_handler(skb->dev)) master = skb->dev; if (master && master->l3mdev_ops->l3mdev_l3_rcv) skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto); return skb; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET6); } static inline struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) { struct net_device *dev = skb_dst(skb)->dev; if (netif_is_l3_slave(dev)) { struct net_device *master; rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); rcu_read_unlock(); } return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET6); } #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex(struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { return 0; } static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) { return 0; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { return false; } static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { return NULL; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn) { return -EOPNOTSUPP; } static inline void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn) { } static inline int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id) { return -ENODEV; } static inline int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { return 1; } static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return false; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return false; } static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { } #endif #endif /* _NET_L3MDEV_H_ */
19 19 13 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com> */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "leds.h" DEFINE_LED_TRIGGER(bt_power_led_trigger); struct hci_basic_led_trigger { struct led_trigger led_trigger; struct hci_dev *hdev; }; #define to_hci_basic_led_trigger(arg) container_of(arg, \ struct hci_basic_led_trigger, led_trigger) void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) { if (hdev->power_led) led_trigger_event(hdev->power_led, enabled ? LED_FULL : LED_OFF); if (!enabled) { struct hci_dev *d; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (test_bit(HCI_UP, &d->flags)) enabled = true; } read_unlock(&hci_dev_list_lock); } led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF); } static int power_activate(struct led_classdev *led_cdev) { struct hci_basic_led_trigger *htrig; bool powered; htrig = to_hci_basic_led_trigger(led_cdev->trigger); powered = test_bit(HCI_UP, &htrig->hdev->flags); led_set_brightness(led_cdev, powered ? LED_FULL : LED_OFF); return 0; } static struct led_trigger *led_allocate_basic(struct hci_dev *hdev, int (*activate)(struct led_classdev *led_cdev), const char *name) { struct hci_basic_led_trigger *htrig; htrig = devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL); if (!htrig) return NULL; htrig->hdev = hdev; htrig->led_trigger.activate = activate; htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s-%s", hdev->name, name); if (!htrig->led_trigger.name) goto err_alloc; if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger)) goto err_register; return &htrig->led_trigger; err_register: devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name); err_alloc: devm_kfree(&hdev->dev, htrig); return NULL; } void hci_leds_init(struct hci_dev *hdev) { /* initialize power_led */ hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); } void bt_leds_init(void) { led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger); } void bt_leds_cleanup(void) { led_trigger_unregister_simple(bt_power_led_trigger); }
4 4 3 1 3 3 3 2 3 191 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 // SPDX-License-Identifier: GPL-2.0 /* * xfrm6_policy.c: based on xfrm4_policy.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki * Split up af-specific portion * */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/l3mdev.h> static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl6.flowi6_mark = params->mark; memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); if (params->saddr) memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); fl6.flowi4_proto = params->ipproto; fl6.uli = params->uli; dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { dst_release(dst); dst = ERR_PTR(err); } return dst; } static int xfrm6_get_saddr(xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; struct inet6_dev *idev; dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; idev = ip6_dst_idev(dst); if (!idev) { dst_release(dst); return -EHOSTUNREACH; } dev = idev->dev; ipv6_dev_get_saddr(dev_net(dev), dev, &params->daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; } static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); xdst->u.rt6.rt6i_idev = in6_dev_get(dev); if (!xdst->u.rt6.rt6i_idev) { netdev_put(dev, &xdst->u.dst.dev_tracker); return -ENODEV; } /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; rt6_uncached_list_add(&xdst->u.rt6); return 0; } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); rt6_uncached_list_del(&xdst->u.rt6); if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); xfrm_dst_destroy(xdst); } static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct xfrm_dst *xdst; xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); } xfrm_dst_ifdown(dst, dev); } static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, }; static int __init xfrm6_policy_init(void) { return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6); } static void xfrm6_policy_fini(void) { xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } #ifdef CONFIG_SYSCTL static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = xfrm6_policy_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; } hdr = register_net_sysctl_sz(net, "net/ipv6", table, ARRAY_SIZE(xfrm6_policy_table)); if (!hdr) goto err_reg; net->ipv6.sysctl.xfrm6_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else /* CONFIG_SYSCTL */ static inline int xfrm6_net_sysctl_init(struct net *net) { return 0; } static inline void xfrm6_net_sysctl_exit(struct net *net) { } #endif static int __net_init xfrm6_net_init(struct net *net) { int ret; memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, sizeof(xfrm6_dst_ops_template)); ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); if (ret) return ret; ret = xfrm6_net_sysctl_init(net); if (ret) dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); return ret; } static void __net_exit xfrm6_net_exit(struct net *net) { xfrm6_net_sysctl_exit(net); dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); } static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; int __init xfrm6_init(void) { int ret; ret = xfrm6_policy_init(); if (ret) goto out; ret = xfrm6_state_init(); if (ret) goto out_policy; ret = xfrm6_protocol_init(); if (ret) goto out_state; ret = register_pernet_subsys(&xfrm6_net_ops); if (ret) goto out_protocol; ret = xfrm_nat_keepalive_init(AF_INET6); if (ret) goto out_nat_keepalive; out: return ret; out_nat_keepalive: unregister_pernet_subsys(&xfrm6_net_ops); out_protocol: xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; } void xfrm6_fini(void) { xfrm_nat_keepalive_fini(AF_INET6); unregister_pernet_subsys(&xfrm6_net_ops); xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); }
368 42 5 155 10 18 3 10 147 53 4 400 399 399 2668 2668 178 178 2494 351 404 2668 2662 20 2671 1 2670 4 208 1 2 1 1 1 1 1 1 3 4 1 2 2 2 1 1 1 2 1 1 2 2 209 208 206 209 10 155 156 18 2 156 10 18 155 2 2 2 2 2 176 176 160 23 175 1 1 175 63 53 160 6 170 8 167 154 21 27 157 2 152 11 164 3 162 2 160 1 6 159 29 1 164 149 17 163 164 140 27 24 24 18 12 26 5 14 19 3 1 4 4 37 1 4 19 5 10 10 5 1 3 6 10 5 13 65 44 34 2 1 2 3 31 10 9 3 1 2 1 1 2 2 40 35 40 40 39 40 40 40 40 39 40 40 40 40 40 40 40 39 40 40 40 40 36 4 35 4 34 4 32 4 28 30 30 24 1 5 25 2 2 25 2 57 3 54 39 2 13 4 10 3 46 29 26 23 3 20 20 19 1 161 153 1 7 22 136 116 27 27 167 166 147 151 21 122 8 124 5 123 3 108 16 111 121 3 113 119 109 118 117 114 2 5 109 2 2 3 3 110 2 101 99 104 5 101 107 2 106 3 107 2 107 2 104 5 104 4 107 2 105 4 102 7 102 7 105 4 102 95 99 4 94 9 99 4 81 97 2 131 1 126 4 108 22 338 356 2 4 4 4 1 3 1 2 3 1 2 2 4 4 1 2 8 3 3 6 3 3 4 1 4 2 2 2 1 2 1 1 3 1 2 1 2 3 3 2 1 357 3 3 3 3 15 13 3 3 4 7 3 15 3 15 8 15 13 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge netlink control interface * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/etherdevice.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <uapi/linux/if_bridge.h> #include "br_private.h" #include "br_private_stp.h" #include "br_private_cfm.h" #include "br_private_tunnel.h" #include "br_private_mcast_eht.h" static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) { struct net_bridge_vlan *v; u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; u16 flags, pvid; int num_vlans = 0; if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) return 0; pvid = br_get_pvid(vg); /* Count number of vlan infos */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v)) continue; if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { vid_range_end = v->vid; continue; } else { if ((vid_range_end - vid_range_start) > 0) num_vlans += 2; else num_vlans += 1; } initvars: vid_range_start = v->vid; vid_range_end = v->vid; vid_range_flags = flags; } if (vid_range_start != 0) { if ((vid_range_end - vid_range_start) > 0) num_vlans += 2; else num_vlans += 1; } return num_vlans; } static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) { int num_vlans; if (!vg) return 0; if (filter_mask & RTEXT_FILTER_BRVLAN) return vg->num_vlans; rcu_read_lock(); num_vlans = __get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); return num_vlans; } static size_t br_get_link_af_size_filtered(const struct net_device *dev, u32 filter_mask) { struct net_bridge_vlan_group *vg = NULL; struct net_bridge_port *p = NULL; struct net_bridge *br = NULL; u32 num_cfm_peer_mep_infos; u32 num_cfm_mep_infos; size_t vinfo_sz = 0; int num_vlan_infos; rcu_read_lock(); if (netif_is_bridge_port(dev)) { p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); } else if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); } num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); if (p && (p->flags & BR_VLAN_TUNNEL)) vinfo_sz += br_get_vlan_tunnel_info_size(vg); /* Each VLAN is returned in bridge_vlan_info along with flags */ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); if (p && vg && (filter_mask & RTEXT_FILTER_MST)) vinfo_sz += br_mst_info_size(vg); if (!(filter_mask & RTEXT_FILTER_CFM_STATUS)) return vinfo_sz; if (!br) return vinfo_sz; /* CFM status info must be added */ br_cfm_mep_count(br, &num_cfm_mep_infos); br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos); vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */ /* For each status struct the MEP instance (u32) is added */ /* MEP instance (u32) + br_cfm_mep_status */ vinfo_sz += num_cfm_mep_infos * /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */ (nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */ + nla_total_size(sizeof(u32))); /* MEP instance (u32) + br_cfm_cc_peer_status */ vinfo_sz += num_cfm_peer_mep_infos * /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */ (nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */ + nla_total_size(sizeof(u8)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */ + nla_total_size(sizeof(u8)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */ + nla_total_size(sizeof(u32))); return vinfo_sz; } static inline size_t br_port_info_size(void) { return nla_total_size(1) /* IFLA_BRPORT_STATE */ + nla_total_size(2) /* IFLA_BRPORT_PRIORITY */ + nla_total_size(4) /* IFLA_BRPORT_COST */ + nla_total_size(1) /* IFLA_BRPORT_MODE */ + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_MCAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_BCAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + nla_total_size(1) /* IFLA_BRPORT_MAB */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ + 0; } static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */ + nla_total_size(br_get_link_af_size_filtered(dev, filter_mask)) /* IFLA_AF_SPEC */ + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */ } static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & BR_HAIRPIN_MODE); struct net_bridge_port *backup_p; u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) || nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) || nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST, !!(p->flags & BR_MULTICAST_TO_UNICAST)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, !!(p->flags & BR_MCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD, !!(p->flags & BR_BCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), &p->designated_root) || nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), &p->designated_bridge) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) || nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) || nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & BR_VLAN_TUNNEL)) || nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) || nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, !!(p->flags & BR_NEIGH_VLAN_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->forward_delay_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->hold_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_HOLD_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, p->multicast_ctx.multicast_router) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, p->multicast_eht_hosts_cnt) || nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS, br_multicast_ngroups_get(&p->multicast_ctx)) || nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS, br_multicast_ngroups_get_max(&p->multicast_ctx))) return -EMSGSIZE; #endif /* we might be called only with br->lock */ rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT, backup_p->dev->ifindex); rcu_read_unlock(); if (p->backup_nhid && nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) return -EMSGSIZE; return 0; } static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start, u16 vid_end, u16 flags) { struct bridge_vlan_info vinfo; if ((vid_end - vid_start) > 0) { /* add range to skb */ vinfo.vid = vid_start; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_BEGIN; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; vinfo.vid = vid_end; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } else { vinfo.vid = vid_start; vinfo.flags = flags; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *v; u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; u16 flags, pvid; int err = 0; /* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan * and mark vlan info with begin and end flags * if vlaninfo represents a range */ pvid = br_get_pvid(vg); list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; if (!br_vlan_should_use(v)) continue; if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { vid_range_end = v->vid; continue; } else { err = br_fill_ifvlaninfo_range(skb, vid_range_start, vid_range_end, vid_range_flags); if (err) return err; } initvars: vid_range_start = v->vid; vid_range_end = v->vid; vid_range_flags = flags; } if (vid_range_start != 0) { /* Call it once more to send any left over vlans */ err = br_fill_ifvlaninfo_range(skb, vid_range_start, vid_range_end, vid_range_flags); if (err) return err; } return 0; } static int br_fill_ifvlaninfo(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct bridge_vlan_info vinfo; struct net_bridge_vlan *v; u16 pvid; pvid = br_get_pvid(vg); list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; vinfo.vid = v->vid; vinfo.flags = 0; if (v->vid == pvid) vinfo.flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } /* * Create one netlink message for one interface * Contains port and master info as well as carrier and bridge state. */ static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, u32 filter_mask, const struct net_device *dev, bool getlink) { u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN; struct nlattr *af = NULL; struct net_bridge *br; struct ifinfomsg *hdr; struct nlmsghdr *nlh; if (port) br = port->br; else br = netdev_priv(dev); br_debug(br, "br_fill_ifinfo event %d port %s master %s\n", event, dev->name, br->dev->name); nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); if (nlh == NULL) return -EMSGSIZE; hdr = nlmsg_data(nlh); hdr->ifi_family = AF_BRIDGE; hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; hdr->ifi_index = dev->ifindex; hdr->ifi_flags = netif_get_flags(dev); hdr->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; if (event == RTM_NEWLINK && port) { struct nlattr *nest; nest = nla_nest_start(skb, IFLA_PROTINFO); if (nest == NULL || br_port_fill_attrs(skb, port) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } if (filter_mask & (RTEXT_FILTER_BRVLAN | RTEXT_FILTER_BRVLAN_COMPRESSED | RTEXT_FILTER_MRP | RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS | RTEXT_FILTER_MST)) { af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) goto nla_put_failure; } /* Check if the VID information is requested */ if ((filter_mask & RTEXT_FILTER_BRVLAN) || (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { struct net_bridge_vlan_group *vg; int err; /* RCU needed because of the VLAN locking rules (rcu || rtnl) */ rcu_read_lock(); if (port) vg = nbp_vlan_group_rcu(port); else vg = br_vlan_group_rcu(br); if (!vg || !vg->num_vlans) { rcu_read_unlock(); goto done; } if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) err = br_fill_ifvlaninfo_compressed(skb, vg); else err = br_fill_ifvlaninfo(skb, vg); if (port && (port->flags & BR_VLAN_TUNNEL)) err = br_fill_vlan_tunnel_info(skb, vg); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & RTEXT_FILTER_MRP) { int err; if (!br_mrp_enabled(br) || port) goto done; rcu_read_lock(); err = br_mrp_fill_info(skb, br); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) { struct nlattr *cfm_nest = NULL; int err; if (!br_cfm_created(br) || port) goto done; cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM); if (!cfm_nest) goto nla_put_failure; if (filter_mask & RTEXT_FILTER_CFM_CONFIG) { rcu_read_lock(); err = br_cfm_config_fill_info(skb, br); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & RTEXT_FILTER_CFM_STATUS) { rcu_read_lock(); err = br_cfm_status_fill_info(skb, br, getlink); rcu_read_unlock(); if (err) goto nla_put_failure; } nla_nest_end(skb, cfm_nest); } if ((filter_mask & RTEXT_FILTER_MST) && br_opt_get(br, BROPT_MST_ENABLED) && port) { const struct net_bridge_vlan_group *vg = nbp_vlan_group(port); struct nlattr *mst_nest; int err; if (!vg || !vg->num_vlans) goto done; mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST); if (!mst_nest) goto nla_put_failure; err = br_mst_fill_info(skb, vg); if (err) goto nla_put_failure; nla_nest_end(skb, mst_nest); } done: if (af) { if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0)) nla_nest_end(skb, af); else nla_nest_cancel(skb, af); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void br_info_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port, u32 filter) { struct net_device *dev; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; u16 port_no = 0; if (WARN_ON(!port && !br)) return; if (port) { dev = port->dev; br = port->br; port_no = port->port_no; } else { dev = br->dev; } net = dev_net(dev); br_debug(br, "port %u(%s) event %d\n", port_no, dev->name, event); skb = nlmsg_new(br_nlmsg_size(dev, filter), GFP_ATOMIC); if (skb == NULL) goto errout; err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); } /* Notify listeners of a change in bridge or port information */ void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port) { u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; br_info_notify(event, br, port, filter); } /* * Dump information about all ports, in response to GETLINK */ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags) { struct net_bridge_port *port = br_port_get_rtnl(dev); if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && !(filter_mask & RTEXT_FILTER_MRP) && !(filter_mask & RTEXT_FILTER_CFM_CONFIG) && !(filter_mask & RTEXT_FILTER_CFM_STATUS)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, filter_mask, dev, true); } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo, bool *changed, struct netlink_ext_ack *extack) { bool curr_change; int err = 0; switch (cmd) { case RTM_SETLINK: if (p) { /* if the MASTER flag is set this will act on the global * per-VLAN entry as well */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags, &curr_change, extack); } else { vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags, &curr_change, extack); } if (curr_change) *changed = true; break; case RTM_DELLINK: if (p) { if (!nbp_vlan_delete(p, vinfo->vid)) *changed = true; if ((vinfo->flags & BRIDGE_VLAN_INFO_MASTER) && !br_vlan_delete(p->br, vinfo->vid)) *changed = true; } else if (!br_vlan_delete(br, vinfo->vid)) { *changed = true; } break; } return err; } int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, struct bridge_vlan_info **vinfo_last, bool *changed, struct netlink_ext_ack *extack) { int err, rtm_cmd; if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; /* needed for vlan-only NEWVLAN/DELVLAN notifications */ rtm_cmd = br_afspec_cmd_to_rtm(cmd); if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; *vinfo_last = vinfo_curr; return 0; } if (*vinfo_last) { struct bridge_vlan_info tmp_vinfo; int v, v_change_start = 0; if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; memcpy(&tmp_vinfo, *vinfo_last, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { bool curr_change = false; tmp_vinfo.vid = v; err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change, extack); if (err) break; if (curr_change) { *changed = curr_change; if (!v_change_start) v_change_start = v; } else { /* nothing to notify yet */ if (!v_change_start) continue; br_vlan_notify(br, p, v_change_start, v - 1, rtm_cmd); v_change_start = 0; } cond_resched(); } /* v_change_start is set only if the last/whole range changed */ if (v_change_start) br_vlan_notify(br, p, v_change_start, v - 1, rtm_cmd); *vinfo_last = NULL; return err; } err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); if (*changed) br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd); return err; } static int br_afspec(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *af_spec, int cmd, bool *changed, struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo_curr = NULL; struct bridge_vlan_info *vinfo_last = NULL; struct nlattr *attr; struct vtunnel_info tinfo_last = {}; struct vtunnel_info tinfo_curr = {}; int err = 0, rem; nla_for_each_nested(attr, af_spec, rem) { err = 0; switch (nla_type(attr)) { case IFLA_BRIDGE_VLAN_TUNNEL_INFO: if (!p || !(p->flags & BR_VLAN_TUNNEL)) return -EINVAL; err = br_parse_vlan_tunnel_info(attr, &tinfo_curr); if (err) return err; err = br_process_vlan_tunnel_info(br, p, cmd, &tinfo_curr, &tinfo_last, changed); if (err) return err; break; case IFLA_BRIDGE_VLAN_INFO: if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo_curr = nla_data(attr); err = br_process_vlan_info(br, p, cmd, vinfo_curr, &vinfo_last, changed, extack); if (err) return err; break; case IFLA_BRIDGE_MRP: err = br_mrp_parse(br, p, attr, cmd, extack); if (err) return err; break; case IFLA_BRIDGE_CFM: err = br_cfm_parse(br, p, attr, cmd, extack); if (err) return err; break; case IFLA_BRIDGE_MST: if (!p) { NL_SET_ERR_MSG(extack, "MST states can only be set on bridge ports"); return -EINVAL; } if (cmd != RTM_SETLINK) { NL_SET_ERR_MSG(extack, "MST states can only be set through RTM_SETLINK"); return -EINVAL; } err = br_mst_process(p, attr, extack); if (err) return err; break; } } return err; } static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_UNSPEC] = { .strict_start_type = IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 }, [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { .type = NLA_U32 }, [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, [IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 }, [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ static int br_set_port_state(struct net_bridge_port *p, u8 state) { if (state > BR_STATE_BLOCKING) return -EINVAL; /* if kernel STP is running, don't allow changes */ if (p->br->stp_enabled == BR_KERNEL_STP) return -EBUSY; /* if device is not up, change is not allowed * if link is not present, only allowable state is disabled */ if (!netif_running(p->dev) || (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED)) return -ENETDOWN; br_set_state(p, state); br_port_state_selection(p->br); return 0; } /* Set/clear or port flags based on attribute */ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[], int attrtype, unsigned long mask) { if (!tb[attrtype]) return; if (nla_get_u8(tb[attrtype])) p->flags |= mask; else p->flags &= ~mask; } /* Process bridge protocol info on port */ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned long old_flags, changed_mask; bool br_vlan_tunnel_old; int err; old_flags = p->flags; br_vlan_tunnel_old = (old_flags & BR_VLAN_TUNNEL) ? true : false; br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, BR_NEIGH_VLAN_SUPPRESS); if ((p->flags & BR_PORT_MAB) && (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) { NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled"); p->flags = old_flags; return -EINVAL; } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) { struct net_bridge_fdb_flush_desc desc = { .flags = BIT(BR_FDB_LOCKED), .flags_mask = BIT(BR_FDB_LOCKED), .port_ifindex = p->dev->ifindex, }; br_fdb_flush(p->br, &desc); } changed_mask = old_flags ^ p->flags; err = br_switchdev_set_port_flag(p, p->flags, changed_mask, extack); if (err) { p->flags = old_flags; return err; } if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL)) nbp_vlan_tunnel_info_flush(p); br_port_flags_change(p, changed_mask); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); if (err) return err; } if (tb[IFLA_BRPORT_PRIORITY]) { err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY])); if (err) return err; } if (tb[IFLA_BRPORT_STATE]) { err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE])); if (err) return err; } if (tb[IFLA_BRPORT_FLUSH]) br_fdb_delete_by_port(p->br, p, 0, 0); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); err = br_multicast_set_port_router(&p->multicast_ctx, mcast_router); if (err) return err; } if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) { u32 hlimit; hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]); err = br_multicast_eht_set_hosts_limit(p, hlimit); if (err) return err; } if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) { u32 max_groups; max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]); br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups); } #endif if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); if (fwd_mask & BR_GROUPFWD_MACPAUSE) return -EINVAL; p->group_fwd_mask = fwd_mask; } if (tb[IFLA_BRPORT_BACKUP_PORT]) { struct net_device *backup_dev = NULL; u32 backup_ifindex; backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]); if (backup_ifindex) { backup_dev = __dev_get_by_index(dev_net(p->dev), backup_ifindex); if (!backup_dev) return -ENOENT; } err = nbp_backup_change(p, backup_dev); if (err) return err; } if (tb[IFLA_BRPORT_BACKUP_NHID]) { u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); WRITE_ONCE(p->backup_nhid, backup_nhid); } return 0; } /* Change state and parameters on port. */ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct nlattr *tb[IFLA_BRPORT_MAX + 1]; struct net_bridge_port *p; struct nlattr *protinfo; struct nlattr *afspec; bool changed = false; int err = 0; protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!protinfo && !afspec) return 0; p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself if the AF_SPEC * is set to see if someone is setting vlan info on the bridge */ if (!p && !afspec) return -EINVAL; if (p && protinfo) { if (protinfo->nla_type & NLA_F_NESTED) { err = nla_parse_nested_deprecated(tb, IFLA_BRPORT_MAX, protinfo, br_port_policy, NULL); if (err) return err; spin_lock_bh(&p->br->lock); err = br_setport(p, tb, extack); spin_unlock_bh(&p->br->lock); } else { /* Binary compatibility with old RSTP */ if (nla_len(protinfo) < sizeof(u8)) return -EINVAL; spin_lock_bh(&p->br->lock); err = br_set_port_state(p, nla_get_u8(protinfo)); spin_unlock_bh(&p->br->lock); } if (err) goto out; changed = true; } if (afspec) err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack); if (changed) br_ifinfo_notify(RTM_NEWLINK, br, p); out: return err; } /* Delete port information */ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct net_bridge_port *p; struct nlattr *afspec; bool changed = false; int err = 0; afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!afspec) return 0; p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself as well */ if (!p && !netif_is_bridge_master(dev)) return -EINVAL; err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL); if (changed) /* Send RTM_NEWLINK because userspace * expects RTM_NEWLINK for vlan dels */ br_ifinfo_notify(RTM_NEWLINK, br, p); return err; } static int br_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (data[IFLA_BR_VLAN_PROTOCOL] && !eth_type_vlan(nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]))) return -EPROTONOSUPPORT; if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); if (defpvid >= VLAN_VID_MASK) return -EINVAL; } #endif return 0; } static int br_port_slave_changelink(struct net_device *brdev, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(brdev); int ret; if (!data) return 0; spin_lock_bh(&br->lock); ret = br_setport(br_port_get_rtnl(dev), data, extack); spin_unlock_bh(&br->lock); return ret; } static int br_port_fill_slave_info(struct sk_buff *skb, const struct net_device *brdev, const struct net_device *dev) { return br_port_fill_attrs(skb, br_port_get_rtnl(dev)); } static size_t br_port_get_slave_size(const struct net_device *brdev, const struct net_device *dev) { return br_port_info_size(); } static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_UNSPEC] = { .strict_start_type = IFLA_BR_FDB_N_LEARNED }, [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, [IFLA_BR_MULTI_BOOLOPT] = NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)), [IFLA_BR_FDB_N_LEARNED] = { .type = NLA_REJECT }, [IFLA_BR_FDB_MAX_LEARNED] = { .type = NLA_U32 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(brdev); int err; if (!data) return 0; if (data[IFLA_BR_FORWARD_DELAY]) { err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY])); if (err) return err; } if (data[IFLA_BR_HELLO_TIME]) { err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME])); if (err) return err; } if (data[IFLA_BR_MAX_AGE]) { err = br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE])); if (err) return err; } if (data[IFLA_BR_AGEING_TIME]) { err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME])); if (err) return err; } if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); err = br_stp_set_enabled(br, stp_enabled, extack); if (err) return err; } if (data[IFLA_BR_PRIORITY]) { u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]); br_stp_set_bridge_priority(br, priority); } if (data[IFLA_BR_VLAN_FILTERING]) { u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); err = br_vlan_filter_toggle(br, vlan_filter, extack); if (err) return err; } #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (data[IFLA_BR_VLAN_PROTOCOL]) { __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); err = __br_vlan_set_proto(br, vlan_proto, extack); if (err) return err; } if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); err = __br_vlan_set_default_pvid(br, defpvid, extack); if (err) return err; } if (data[IFLA_BR_VLAN_STATS_ENABLED]) { __u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]); err = br_vlan_set_stats(br, vlan_stats); if (err) return err; } if (data[IFLA_BR_VLAN_STATS_PER_PORT]) { __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]); err = br_vlan_set_stats_per_port(br, per_port); if (err) return err; } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); if (fwd_mask & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = fwd_mask; } if (data[IFLA_BR_GROUP_ADDR]) { u8 new_addr[ETH_ALEN]; if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN) return -EINVAL; memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN); if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; spin_lock_bh(&br->lock); memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); } if (data[IFLA_BR_FDB_FLUSH]) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (data[IFLA_BR_MCAST_ROUTER]) { u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); err = br_multicast_set_router(&br->multicast_ctx, multicast_router); if (err) return err; } if (data[IFLA_BR_MCAST_SNOOPING]) { u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); err = br_multicast_toggle(br, mcast_snooping, extack); if (err) return err; } if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { u8 val; val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); } if (data[IFLA_BR_MCAST_QUERIER]) { u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); err = br_multicast_set_querier(&br->multicast_ctx, mcast_querier); if (err) return err; } if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", RHT_ELASTICITY); if (data[IFLA_BR_MCAST_HASH_MAX]) br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); br->multicast_ctx.multicast_last_member_count = val; } if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); br->multicast_ctx.multicast_startup_query_count = val; } if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); br_multicast_set_query_intvl(&br->multicast_ctx, val); } if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); } if (data[IFLA_BR_MCAST_STATS_ENABLED]) { __u8 mcast_stats; mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats); } if (data[IFLA_BR_MCAST_IGMP_VERSION]) { __u8 igmp_version; igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); err = br_multicast_set_igmp_version(&br->multicast_ctx, igmp_version); if (err) return err; } #if IS_ENABLED(CONFIG_IPV6) if (data[IFLA_BR_MCAST_MLD_VERSION]) { __u8 mld_version; mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); err = br_multicast_set_mld_version(&br->multicast_ctx, mld_version); if (err) return err; } #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); } if (data[IFLA_BR_NF_CALL_IP6TABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); } if (data[IFLA_BR_NF_CALL_ARPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); } #endif if (data[IFLA_BR_MULTI_BOOLOPT]) { struct br_boolopt_multi *bm; bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]); err = br_boolopt_multi_toggle(br, bm, extack); if (err) return err; } if (data[IFLA_BR_FDB_MAX_LEARNED]) { u32 val = nla_get_u32(data[IFLA_BR_FDB_MAX_LEARNED]); WRITE_ONCE(br->fdb_max_learned, val); } return 0; } static int br_dev_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; err = register_netdevice(dev); if (err) return err; if (tb[IFLA_ADDRESS]) { spin_lock_bh(&br->lock); br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); spin_unlock_bh(&br->lock); } err = br_changelink(dev, tb, data, extack); if (err) br_dev_delete(dev, NULL); return err; } static size_t br_get_size(const struct net_device *brdev) { return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_N_LEARNED */ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_MAX_LEARNED */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ #endif nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */ 0; } static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) { struct net_bridge *br = netdev_priv(brdev); u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br->dev); struct br_boolopt_multi bm; u64 clockval; clockval = br_timer_value(&br->hello_timer); if (nla_put_u64_64bit(skb, IFLA_BR_HELLO_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->tcn_timer); if (nla_put_u64_64bit(skb, IFLA_BR_TCN_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->topology_change_timer); if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->gc_work.timer); if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; br_boolopt_multi_get(br, &bm); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) || nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id), &br->bridge_id) || nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id), &br->designated_root) || nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) || nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm) || nla_put_u32(skb, IFLA_BR_FDB_N_LEARNED, atomic_read(&br->fdb_n_learned)) || nla_put_u32(skb, IFLA_BR_FDB_MAX_LEARNED, br->fdb_max_learned)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, br_opt_get(br, BROPT_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_ctx.multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, br_opt_get(br, BROPT_MULTICAST_ENABLED)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_ctx.multicast_querier) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_ctx.multicast_last_member_count) || nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, br->multicast_ctx.multicast_startup_query_count) || nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, br->multicast_ctx.multicast_igmp_version) || br_multicast_dump_querier_state(skb, &br->multicast_ctx, IFLA_BR_MCAST_QUERIER_STATE)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, br->multicast_ctx.multicast_mld_version)) return -EMSGSIZE; #endif clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0)) return -EMSGSIZE; #endif return 0; } static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) { struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge *br; int numvls = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: br = netdev_priv(dev); vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: p = br_port_get_rtnl(dev); if (!p) return 0; vg = nbp_vlan_group(p); break; default: return 0; } if (vg) { /* we need to count all, even placeholder entries */ list_for_each_entry(v, &vg->vlan_list, vlist) numvls++; } return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + nla_total_size_64bit(sizeof(struct br_mcast_stats)) + (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) + nla_total_size(0); } static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr) { struct nlattr *nla __maybe_unused; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge *br; struct nlattr *nest; int vl_idx = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: br = netdev_priv(dev); vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: p = br_port_get_rtnl(dev); if (!p) return 0; br = p->br; vg = nbp_vlan_group(p); break; default: return -EINVAL; } nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; if (vg) { u16 pvid; pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; struct pcpu_sw_netstats stats; if (++vl_idx < *prividx) continue; memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; vxi.flags = v->flags; if (v->vid == pvid) vxi.flags |= BRIDGE_VLAN_INFO_PVID; br_vlan_get_stats(v, &stats); vxi.rx_bytes = u64_stats_read(&stats.rx_bytes); vxi.rx_packets = u64_stats_read(&stats.rx_packets); vxi.tx_bytes = u64_stats_read(&stats.tx_bytes); vxi.tx_packets = u64_stats_read(&stats.tx_packets); if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) goto nla_put_failure; } } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (++vl_idx >= *prividx) { nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, sizeof(struct br_mcast_stats), BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; br_multicast_get_stats(br, p, nla_data(nla)); } #endif if (p) { nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP, sizeof(p->stp_xstats), BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; spin_lock_bh(&br->lock); memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats)); spin_unlock_bh(&br->lock); } nla_nest_end(skb, nest); *prividx = 0; return 0; nla_put_failure: nla_nest_end(skb, nest); *prividx = vl_idx; return -EMSGSIZE; } static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, }; struct rtnl_link_ops br_link_ops __read_mostly = { .kind = "bridge", .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, .maxtype = IFLA_BR_MAX, .policy = br_policy, .validate = br_validate, .newlink = br_dev_newlink, .changelink = br_changelink, .dellink = br_dev_delete, .get_size = br_get_size, .fill_info = br_fill_info, .fill_linkxstats = br_fill_linkxstats, .get_linkxstats_size = br_get_linkxstats_size, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, .slave_changelink = br_port_slave_changelink, .get_slave_size = br_port_get_slave_size, .fill_slave_info = br_port_fill_slave_info, }; int __init br_netlink_init(void) { int err; err = br_vlan_rtnl_init(); if (err) goto out; err = rtnl_af_register(&br_af_ops); if (err) goto out_vlan; err = rtnl_link_register(&br_link_ops); if (err) goto out_af; return 0; out_af: rtnl_af_unregister(&br_af_ops); out_vlan: br_vlan_rtnl_uninit(); out: return err; } void br_netlink_fini(void) { br_vlan_rtnl_uninit(); rtnl_af_unregister(&br_af_ops); rtnl_link_unregister(&br_link_ops); }
4714 4701 4715 4714 4709 3112 3115 8 3 2 8 12 5 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 #include <linux/bpf.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/namei.h> #include <linux/user_namespace.h> #include <linux/security.h> static bool bpf_ns_capable(struct user_namespace *ns, int cap) { return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN)); } bool bpf_token_capable(const struct bpf_token *token, int cap) { struct user_namespace *userns; /* BPF token allows ns_capable() level of capabilities */ userns = token ? token->userns : &init_user_ns; if (!bpf_ns_capable(userns, cap)) return false; if (token && security_bpf_token_capable(token, cap) < 0) return false; return true; } void bpf_token_inc(struct bpf_token *token) { atomic64_inc(&token->refcnt); } static void bpf_token_free(struct bpf_token *token) { security_bpf_token_free(token); put_user_ns(token->userns); kfree(token); } static void bpf_token_put_deferred(struct work_struct *work) { struct bpf_token *token = container_of(work, struct bpf_token, work); bpf_token_free(token); } void bpf_token_put(struct bpf_token *token) { if (!token) return; if (!atomic64_dec_and_test(&token->refcnt)) return; INIT_WORK(&token->work, bpf_token_put_deferred); schedule_work(&token->work); } static int bpf_token_release(struct inode *inode, struct file *filp) { struct bpf_token *token = filp->private_data; bpf_token_put(token); return 0; } static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_token *token = filp->private_data; u64 mask; BUILD_BUG_ON(__MAX_BPF_CMD >= 64); mask = BIT_ULL(__MAX_BPF_CMD) - 1; if ((token->allowed_cmds & mask) == mask) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1; if ((token->allowed_maps & mask) == mask) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1; if ((token->allowed_progs & mask) == mask) seq_printf(m, "allowed_progs:\tany\n"); else seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1; if ((token->allowed_attachs & mask) == mask) seq_printf(m, "allowed_attachs:\tany\n"); else seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); } #define BPF_TOKEN_INODE_NAME "bpf-token" static const struct inode_operations bpf_token_iops = { }; const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; int bpf_token_create(union bpf_attr *attr) { struct bpf_mount_opts *mnt_opts; struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; int err, fd; if (fd_empty(f)) return -EBADF; path = fd_file(f)->f_path; sb = path.dentry->d_sb; if (path.dentry != sb->s_root) return -EINVAL; if (sb->s_op != &bpf_super_ops) return -EINVAL; err = path_permission(&path, MAY_ACCESS); if (err) return err; userns = sb->s_user_ns; /* * Enforce that creators of BPF tokens are in the same user * namespace as the BPF FS instance. This makes reasoning about * permissions a lot easier and we can always relax this later. */ if (current_user_ns() != userns) return -EPERM; if (!ns_capable(userns, CAP_BPF)) return -EPERM; /* Creating BPF token in init_user_ns doesn't make much sense. */ if (current_user_ns() == &init_user_ns) return -EOPNOTSUPP; mnt_opts = sb->s_fs_info; if (mnt_opts->delegate_cmds == 0 && mnt_opts->delegate_maps == 0 && mnt_opts->delegate_progs == 0 && mnt_opts->delegate_attachs == 0) return -ENOENT; /* no BPF token delegation is set up */ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); inode = bpf_get_inode(sb, NULL, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &bpf_token_iops; inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); if (IS_ERR(file)) { iput(inode); return PTR_ERR(file); } token = kzalloc(sizeof(*token), GFP_USER); if (!token) { err = -ENOMEM; goto out_file; } atomic64_set(&token->refcnt, 1); /* remember bpffs owning userns for future ns_capable() checks */ token->userns = get_user_ns(userns); token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; token->allowed_attachs = mnt_opts->delegate_attachs; err = security_bpf_token_create(token, attr, &path); if (err) goto out_token; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; goto out_token; } file->private_data = token; fd_install(fd, file); return fd; out_token: bpf_token_free(token); out_file: fput(file); return err; } int bpf_token_get_info_by_fd(struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_token_info info; u32 info_len = attr->info.info_len; info_len = min_t(u32, info_len, sizeof(info)); memset(&info, 0, sizeof(info)); info.allowed_cmds = token->allowed_cmds; info.allowed_maps = token->allowed_maps; info.allowed_progs = token->allowed_progs; info.allowed_attachs = token->allowed_attachs; if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_token *token; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_token_fops) return ERR_PTR(-EINVAL); token = fd_file(f)->private_data; bpf_token_inc(token); return token; } bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { if (!token) return false; if (!(token->allowed_cmds & BIT_ULL(cmd))) return false; return security_bpf_token_cmd(token, cmd) == 0; } bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) { if (!token || type >= __MAX_BPF_MAP_TYPE) return false; return token->allowed_maps & BIT_ULL(type); } bool bpf_token_allow_prog_type(const struct bpf_token *token, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) return false; return (token->allowed_progs & BIT_ULL(prog_type)) && (token->allowed_attachs & BIT_ULL(attach_type)); }
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __CPUSET_INTERNAL_H #define __CPUSET_INTERNAL_H #include <linux/cgroup.h> #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/spinlock.h> #include <linux/union_find.h> /* See "Frequency meter" comments, below. */ struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; /* * Invalid partition error code */ enum prs_errcode { PERR_NONE = 0, PERR_INVCPUS, PERR_INVPARENT, PERR_NOTPART, PERR_NOTEXCL, PERR_NOCPUS, PERR_HOTPLUG, PERR_CPUSEMPTY, PERR_HKEEPING, PERR_ACCESS, PERR_REMOTE, }; /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, CS_MEMORY_MIGRATE, CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, } cpuset_flagbits_t; /* The various types of files and directories in a cpuset file system */ typedef enum { FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, FILE_SUBPARTS_CPULIST, FILE_EXCLUSIVE_CPULIST, FILE_EFFECTIVE_XCPULIST, FILE_ISOLATED_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, FILE_SCHED_LOAD_BALANCE, FILE_PARTITION_ROOT, FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, } cpuset_filetype_t; struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ /* * On default hierarchy: * * The user-configured masks can only be changed by writing to * cpuset.cpus and cpuset.mems, and won't be limited by the * parent masks. * * The effective masks is the real masks that apply to the tasks * in the cpuset. They may be changed if the configured masks are * changed or hotplug happens. * * effective_mask == configured_mask & parent's effective_mask, * and if it ends up empty, it will inherit the parent's mask. * * * On legacy hierarchy: * * The user-configured masks are always the same with effective masks. */ /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; nodemask_t effective_mems; /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * * The effective_cpus of a valid partition root comes solely from its * effective_xcpus and some of the effective_xcpus may be distributed * to sub-partitions below & hence excluded from its effective_cpus. * For a valid partition root, its effective_cpus have no relationship * with cpus_allowed unless its exclusive_cpus isn't set. * * This value will only be set if either exclusive_cpus is set or * when this cpuset becomes a local partition root. */ cpumask_var_t effective_xcpus; /* * Exclusive CPUs as requested by the user (default hierarchy only) * * Its value is independent of cpus_allowed and designates the set of * CPUs that can be granted to the current cpuset or its children when * it becomes a valid partition root. The effective set of exclusive * CPUs granted (effective_xcpus) depends on whether those exclusive * CPUs are passed down by its ancestors and not yet taken up by * another sibling partition root along the way. * * If its value isn't set, it defaults to cpus_allowed. */ cpumask_var_t exclusive_cpus; /* * This is old Memory Nodes tasks took on. * * - top_cpuset.old_mems_allowed is initialized to mems_allowed. * - A new cpuset's old_mems_allowed is initialized when some * task is moved into it. * - old_mems_allowed is used in cpuset_migrate_mm() when we change * cpuset.mems_allowed and have tasks' nodemask updated, and * then old_mems_allowed is updated to mems_allowed. */ nodemask_t old_mems_allowed; struct fmeter fmeter; /* memory_pressure filter */ /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). */ int attach_in_progress; /* for custom sched domain */ int relax_domain_level; /* number of valid local child partitions */ int nr_subparts; /* partition root state */ int partition_root_state; /* * number of SCHED_DEADLINE tasks attached to this cpuset, so that we * know when to rebuild associated root domain bandwidth information. */ int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; /* Remote partition silbling list anchored at remote_children */ struct list_head remote_sibling; /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) { return css_cs(cs->css.parent); } /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); } static inline int is_mem_exclusive(const struct cpuset *cs) { return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } static inline int is_mem_hardwall(const struct cpuset *cs) { return test_bit(CS_MEM_HARDWALL, &cs->flags); } static inline int is_sched_load_balance(const struct cpuset *cs) { return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } static inline int is_memory_migrate(const struct cpuset *cs) { return test_bit(CS_MEMORY_MIGRATE, &cs->flags); } static inline int is_spread_page(const struct cpuset *cs) { return test_bit(CS_SPREAD_PAGE, &cs->flags); } static inline int is_spread_slab(const struct cpuset *cs) { return test_bit(CS_SPREAD_SLAB, &cs->flags); } /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. */ #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ css_for_each_child((pos_css), &(parent_cs)->css) \ if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used * with RCU read locked. The caller may modify @pos_css by calling * css_rightmost_descendant() to skip subtree. @root_cs is included in the * iteration and the first node to be visited. */ #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) void rebuild_sched_domains_locked(void); void cpuset_callback_lock_irq(void); void cpuset_callback_unlock_irq(void); void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus); void cpuset_update_tasks_nodemask(struct cpuset *cs); int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); void cpuset_full_lock(void); void cpuset_full_unlock(void); /* * cpuset-v1.c */ #ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void cpuset1_update_tasks_flags(struct cpuset *cs); void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); #else static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */
12 12 5 5 12 12 12 12 5 5 16 5 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles * or rs-channels. It also implements echoing, cooked mode etc. * * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. * * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the * tty_struct and tty_queue structures. Previously there was an array * of 256 tty_struct's which was statically allocated, and the * tty_queue structures were allocated at boot time. Both are now * dynamically allocated only when the tty is open. * * Also restructured routines so that there is more of a separation * between the high-level tty routines (tty_io.c and tty_ioctl.c) and * the low-level tty routines (serial.c, pty.c, console.c). This * makes for cleaner and more compact code. -TYT, 9/17/92 * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). * * NOTE: pay no attention to the line discipline code (yet); its * interface is still subject to change in this version... * -- TYT, 1/31/92 * * Added functionality to the OPOST tty handling. No delays, but all * other bits should be there. * -- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993. * * Rewrote canonical mode and added more termios flags. * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 * * Reorganized FASYNC support so mouse code can share it. * -- ctm@ardi.com, 9Sep95 * * New TIOCLINUX variants added. * -- mj@k332.feld.cvut.cz, 19-Nov-95 * * Restrict vt switching via ioctl() * -- grif@cs.ucr.edu, 5-Dec-95 * * Move console and virtual terminal code to more appropriate files, * implement CONFIG_VT and generalize console device interface. * -- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97 * * Rewrote tty_init_dev and tty_release_dev to eliminate races. * -- Bill Hawes <whawes@star.net>, June 97 * * Added devfs support. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998 * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * * Reduced memory usage for older ARM systems * -- Russell King <rmk@arm.linux.org.uk> * * Move do_SAK() into process context. Less stack use in devfs functions. * alloc_tty_struct() always uses kmalloc() * -- Andrew Morton <andrewm@uow.edu.eu> 17Mar01 */ #include <linux/types.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/devpts_fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/console.h> #include <linux/timer.h> #include <linux/ctype.h> #include <linux/kd.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/ppp-ioctl.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/seq_file.h> #include <linux/serial.h> #include <linux/ratelimit.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/termios_internal.h> #include <linux/fs.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/kmod.h> #include <linux/nsproxy.h> #include "tty.h" #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do { } while (0) #endif #define TTY_PARANOIA_CHECK 1 #define CHECK_TTY_COUNT 1 struct ktermios tty_std_termios = { /* for the benefit of tty drivers */ .c_iflag = ICRNL | IXON, .c_oflag = OPOST | ONLCR, .c_cflag = B38400 | CS8 | CREAD | HUPCL, .c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN, .c_cc = INIT_C_CC, .c_ispeed = 38400, .c_ospeed = 38400, /* .c_line = N_TTY, */ }; EXPORT_SYMBOL(tty_std_termios); /* This list gets poked at by procfs and various bits of boot up code. This * could do with some rationalisation such as pulling the tty proc function * into this file. */ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ /* Mutex to protect creating and releasing a tty */ DEFINE_MUTEX(tty_mutex); static ssize_t tty_read(struct kiocb *, struct iov_iter *); static ssize_t tty_write(struct kiocb *, struct iov_iter *); static __poll_t tty_poll(struct file *, poll_table *); static int tty_open(struct inode *, struct file *); #ifdef CONFIG_COMPAT static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define tty_compat_ioctl NULL #endif static int __tty_fasync(int fd, struct file *filp, int on); static int tty_fasync(int fd, struct file *filp, int on); static void release_tty(struct tty_struct *tty, int idx); /** * free_tty_struct - free a disused tty * @tty: tty struct to free * * Free the write buffers, tty queue and tty memory itself. * * Locking: none. Must be called after tty is definitely unused */ static void free_tty_struct(struct tty_struct *tty) { tty_ldisc_deinit(tty); put_device(tty->dev); kvfree(tty->write_buf); kfree(tty); } static inline struct tty_struct *file_tty(struct file *file) { return ((struct tty_file_private *)file->private_data)->tty; } int tty_alloc_file(struct file *file) { struct tty_file_private *priv; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; file->private_data = priv; return 0; } /* Associate a new file with the tty structure */ void tty_add_file(struct tty_struct *tty, struct file *file) { struct tty_file_private *priv = file->private_data; priv->tty = tty; priv->file = file; spin_lock(&tty->files_lock); list_add(&priv->list, &tty->tty_files); spin_unlock(&tty->files_lock); } /** * tty_free_file - free file->private_data * @file: to free private_data of * * This shall be used only for fail path handling when tty_add_file was not * called yet. */ void tty_free_file(struct file *file) { struct tty_file_private *priv = file->private_data; file->private_data = NULL; kfree(priv); } /* Delete file from its tty */ static void tty_del_file(struct file *file) { struct tty_file_private *priv = file->private_data; struct tty_struct *tty = priv->tty; spin_lock(&tty->files_lock); list_del(&priv->list); spin_unlock(&tty->files_lock); tty_free_file(file); } /** * tty_name - return tty naming * @tty: tty structure * * Convert a tty structure into a name. The name reflects the kernel naming * policy and if udev is in use may not reflect user space * * Locking: none */ const char *tty_name(const struct tty_struct *tty) { if (!tty) /* Hmm. NULL pointer. That's fun. */ return "NULL tty"; return tty->name; } EXPORT_SYMBOL(tty_name); const char *tty_driver_name(const struct tty_struct *tty) { if (!tty || !tty->driver) return ""; return tty->driver->name; } static int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK if (!tty) { pr_warn("(%d:%d): %s: NULL tty\n", imajor(inode), iminor(inode), routine); return 1; } #endif return 0; } /* Caller must hold tty_lock */ static void check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT struct list_head *p; int count = 0, kopen_count = 0; scoped_guard(spinlock, &tty->files_lock) list_for_each(p, &tty->tty_files) count++; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) count++; if (tty_port_kopened(tty->port)) kopen_count++; if (tty->count != (count + kopen_count)) { tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n", routine, tty->count, count, kopen_count); } #endif } /** * get_tty_driver - find device of a tty * @device: device identifier * @index: returns the index of the tty * * This routine returns a tty driver structure, given a device number and also * passes back the index number. * * Locking: caller must hold tty_mutex */ static struct tty_driver *get_tty_driver(dev_t device, int *index) { struct tty_driver *p; list_for_each_entry(p, &tty_drivers, tty_drivers) { dev_t base = MKDEV(p->major, p->minor_start); if (device < base || device >= base + p->num) continue; *index = device - base; return tty_driver_kref_get(p); } return NULL; } /** * tty_dev_name_to_number - return dev_t for device name * @name: user space name of device under /dev * @number: pointer to dev_t that this function will populate * * This function converts device names like ttyS0 or ttyUSB1 into dev_t like * (4, 64) or (188, 1). If no corresponding driver is registered then the * function returns -%ENODEV. * * Locking: this acquires tty_mutex to protect the tty_drivers list from * being modified while we are traversing it, and makes sure to * release it before exiting. */ int tty_dev_name_to_number(const char *name, dev_t *number) { struct tty_driver *p; int ret; int index, prefix_length = 0; const char *str; for (str = name; *str && !isdigit(*str); str++) ; if (!*str) return -EINVAL; ret = kstrtoint(str, 10, &index); if (ret) return ret; prefix_length = str - name; guard(mutex)(&tty_mutex); list_for_each_entry(p, &tty_drivers, tty_drivers) if (prefix_length == strlen(p->name) && strncmp(name, p->name, prefix_length) == 0) { if (index < p->num) { *number = MKDEV(p->major, p->minor_start + index); return 0; } } return -ENODEV; } EXPORT_SYMBOL_GPL(tty_dev_name_to_number); #ifdef CONFIG_CONSOLE_POLL /** * tty_find_polling_driver - find device of a polled tty * @name: name string to match * @line: pointer to resulting tty line nr * * This routine returns a tty driver structure, given a name and the condition * that the tty driver is capable of polled operation. */ struct tty_driver *tty_find_polling_driver(char *name, int *line) { struct tty_driver *p; int tty_line = 0; int len; char *str, *stp; for (str = name; *str; str++) if ((*str >= '0' && *str <= '9') || *str == ',') break; if (!*str) return NULL; len = str - name; tty_line = simple_strtoul(str, &str, 10); guard(mutex)(&tty_mutex); /* Search through the tty devices to look for a match */ list_for_each_entry(p, &tty_drivers, tty_drivers) { if (!len || strncmp(name, p->name, len) != 0) continue; stp = str; if (*stp == ',') stp++; if (*stp == '\0') stp = NULL; if (tty_line >= 0 && tty_line < p->num && p->ops && p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) { *line = tty_line; return tty_driver_kref_get(p); } } return NULL; } EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif static ssize_t hung_up_tty_read(struct kiocb *iocb, struct iov_iter *to) { return 0; } static ssize_t hung_up_tty_write(struct kiocb *iocb, struct iov_iter *from) { return -EIO; } /* No kernel lock held - none needed ;) */ static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait) { return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM; } static long hung_up_tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static long hung_up_tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static int hung_up_tty_fasync(int fd, struct file *file, int on) { return -ENOTTY; } static void tty_show_fdinfo(struct seq_file *m, struct file *file) { struct tty_struct *tty = file_tty(file); if (tty && tty->ops && tty->ops->show_fdinfo) tty->ops->show_fdinfo(tty, m); } static const struct file_operations tty_fops = { .read_iter = tty_read, .write_iter = tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, .show_fdinfo = tty_show_fdinfo, }; static const struct file_operations console_fops = { .read_iter = tty_read, .write_iter = redirected_tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, }; static const struct file_operations hung_up_tty_fops = { .read_iter = hung_up_tty_read, .write_iter = hung_up_tty_write, .poll = hung_up_tty_poll, .unlocked_ioctl = hung_up_tty_ioctl, .compat_ioctl = hung_up_tty_compat_ioctl, .release = tty_release, .fasync = hung_up_tty_fasync, }; static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; /** * tty_wakeup - request more data * @tty: terminal * * Internal and external helper for wakeups of tty. This function informs the * line discipline if present that the driver is ready to receive more output * data. */ void tty_wakeup(struct tty_struct *tty) { struct tty_ldisc *ld; if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->write_wakeup) ld->ops->write_wakeup(tty); tty_ldisc_deref(ld); } } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } EXPORT_SYMBOL_GPL(tty_wakeup); /** * tty_release_redirect - Release a redirect on a pty if present * @tty: tty device * * This is available to the pty code so if the master closes, if the slave is a * redirect it can release the redirect. */ static struct file *tty_release_redirect(struct tty_struct *tty) { guard(spinlock)(&redirect_lock); if (redirect && file_tty(redirect) == tty) { struct file *f = redirect; redirect = NULL; return f; } return NULL; } /** * __tty_hangup - actual handler for hangup events * @tty: tty device * @exit_session: if non-zero, signal all foreground group processes * * This can be called by a "kworker" kernel thread. That is process synchronous * but doesn't hold any locks, so we need to make sure we have the appropriate * locks for what we're doing. * * The hangup event clears any pending redirections onto the hung up device. It * ensures future writes will error and it does the needed line discipline * hangup and signal delivery. The tty object itself remains intact. * * Locking: * * BTM * * * redirect lock for undoing redirection * * file list lock for manipulating list of ttys * * tty_ldiscs_lock from called functions * * termios_rwsem resetting termios data * * tasklist_lock to walk task list for hangup event * * * ->siglock to protect ->signal/->sighand * */ static void __tty_hangup(struct tty_struct *tty, int exit_session) { struct file *cons_filp = NULL; struct file *filp, *f; struct tty_file_private *priv; int closecount = 0, n; int refs; if (!tty) return; f = tty_release_redirect(tty); tty_lock(tty); if (test_bit(TTY_HUPPED, &tty->flags)) { tty_unlock(tty); return; } /* * Some console devices aren't actually hung up for technical and * historical reasons, which can lead to indefinite interruptible * sleep in n_tty_read(). The following explicitly tells * n_tty_read() to abort readers. */ set_bit(TTY_HUPPING, &tty->flags); /* inuse_filps is protected by the single tty lock, * this really needs to change if we want to flush the * workqueue with the lock held. */ check_tty_count(tty, "tty_hangup"); spin_lock(&tty->files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? */ list_for_each_entry(priv, &tty->tty_files, list) { filp = priv->file; if (filp->f_op->write_iter == redirected_tty_write) cons_filp = filp; if (filp->f_op->write_iter != tty_write) continue; closecount++; __tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } spin_unlock(&tty->files_lock); refs = tty_signal_session_leader(tty, exit_session); /* Account for the p->signal references we killed */ while (refs--) tty_kref_put(tty); tty_ldisc_hangup(tty, cons_filp != NULL); spin_lock_irq(&tty->ctrl.lock); clear_bit(TTY_THROTTLED, &tty->flags); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); put_pid(tty->ctrl.session); put_pid(tty->ctrl.pgrp); tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; tty->ctrl.pktstatus = 0; spin_unlock_irq(&tty->ctrl.lock); /* * If one of the devices matches a console pointer, we * cannot just call hangup() because that will cause * tty->count and state->count to go out of sync. * So we just call close() the right number of times. */ if (cons_filp) { if (tty->ops->close) for (n = 0; n < closecount; n++) tty->ops->close(tty, cons_filp); } else if (tty->ops->hangup) tty->ops->hangup(tty); /* * We don't want to have driver/ldisc interactions beyond the ones * we did here. The driver layer expects no calls after ->hangup() * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) fput(f); } static void do_tty_hangup(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); __tty_hangup(tty, 0); } /** * tty_hangup - trigger a hangup event * @tty: tty to hangup * * A carrier loss (virtual or otherwise) has occurred on @tty. Schedule a * hangup sequence to run after this event. */ void tty_hangup(struct tty_struct *tty) { tty_debug_hangup(tty, "hangup\n"); schedule_work(&tty->hangup_work); } EXPORT_SYMBOL(tty_hangup); /** * tty_vhangup - process vhangup * @tty: tty to hangup * * The user has asked via system call for the terminal to be hung up. We do * this synchronously so that when the syscall returns the process is complete. * That guarantee is necessary for security reasons. */ void tty_vhangup(struct tty_struct *tty) { tty_debug_hangup(tty, "vhangup\n"); __tty_hangup(tty, 0); } EXPORT_SYMBOL(tty_vhangup); /** * tty_vhangup_self - process vhangup for own ctty * * Perform a vhangup on the current controlling tty */ void tty_vhangup_self(void) { struct tty_struct *tty; tty = get_current_tty(); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } } /** * tty_vhangup_session - hangup session leader exit * @tty: tty to hangup * * The session leader is exiting and hanging up its controlling terminal. * Every process in the foreground process group is signalled %SIGHUP. * * We do this synchronously so that when the syscall returns the process is * complete. That guarantee is necessary for security reasons. */ void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); } /** * tty_hung_up_p - was tty hung up * @filp: file pointer of tty * * Return: true if the tty has been subject to a vhangup or a carrier loss */ int tty_hung_up_p(struct file *filp) { return (filp && filp->f_op == &hung_up_tty_fops); } EXPORT_SYMBOL(tty_hung_up_p); void __stop_tty(struct tty_struct *tty) { if (tty->flow.stopped) return; tty->flow.stopped = true; if (tty->ops->stop) tty->ops->stop(tty); } /** * stop_tty - propagate flow control * @tty: tty to stop * * Perform flow control to the driver. May be called on an already stopped * device and will not re-call the &tty_driver->stop() method. * * This functionality is used by both the line disciplines for halting incoming * flow and by the driver. It may therefore be called from any context, may be * under the tty %atomic_write_lock but not always. * * Locking: * flow.lock */ void stop_tty(struct tty_struct *tty) { guard(spinlock_irqsave)(&tty->flow.lock); __stop_tty(tty); } EXPORT_SYMBOL(stop_tty); void __start_tty(struct tty_struct *tty) { if (!tty->flow.stopped || tty->flow.tco_stopped) return; tty->flow.stopped = false; if (tty->ops->start) tty->ops->start(tty); tty_wakeup(tty); } /** * start_tty - propagate flow control * @tty: tty to start * * Start a tty that has been stopped if at all possible. If @tty was previously * stopped and is now being started, the &tty_driver->start() method is invoked * and the line discipline woken. * * Locking: * flow.lock */ void start_tty(struct tty_struct *tty) { guard(spinlock_irqsave)(&tty->flow.lock); __start_tty(tty); } EXPORT_SYMBOL(start_tty); static void tty_update_time(struct tty_struct *tty, bool mtime) { time64_t sec = ktime_get_real_seconds(); struct tty_file_private *priv; guard(spinlock)(&tty->files_lock); list_for_each_entry(priv, &tty->tty_files, list) { struct inode *inode = file_inode(priv->file); struct timespec64 time = mtime ? inode_get_mtime(inode) : inode_get_atime(inode); /* * We only care if the two values differ in anything other than the * lower three bits (i.e every 8 seconds). If so, then we can update * the time of the tty device, otherwise it could be construded as a * security leak to let userspace know the exact timing of the tty. */ if ((sec ^ time.tv_sec) & ~7) { if (mtime) inode_set_mtime(inode, sec, 0); else inode_set_atime(inode, sec, 0); } } } /* * Iterate on the ldisc ->read() function until we've gotten all * the data the ldisc has for us. * * The "cookie" is something that the ldisc read function can fill * in to let us know that there is more data to be had. * * We promise to continue to call the ldisc until it stops returning * data or clears the cookie. The cookie may be something that the * ldisc maintains state for and needs to free. */ static ssize_t iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file, struct iov_iter *to) { void *cookie = NULL; unsigned long offset = 0; ssize_t retval = 0; size_t copied, count = iov_iter_count(to); u8 kernel_buf[64]; do { ssize_t size = min(count, sizeof(kernel_buf)); size = ld->ops->read(tty, file, kernel_buf, size, &cookie, offset); if (!size) break; if (size < 0) { /* Did we have an earlier error (ie -EFAULT)? */ if (retval) break; retval = size; /* * -EOVERFLOW means we didn't have enough space * for a whole packet, and we shouldn't return * a partial result. */ if (retval == -EOVERFLOW) offset = 0; break; } copied = copy_to_iter(kernel_buf, size, to); offset += copied; count -= copied; /* * If the user copy failed, we still need to do another ->read() * call if we had a cookie to let the ldisc clear up. * * But make sure size is zeroed. */ if (unlikely(copied != size)) { count = 0; retval = -EFAULT; } } while (cookie); /* We always clear tty buffer in case they contained passwords */ memzero_explicit(kernel_buf, sizeof(kernel_buf)); return offset ? offset : retval; } /** * tty_read - read method for tty device files * @iocb: kernel I/O control block * @to: destination for the data read * * Perform the read system call function on this terminal device. Checks * for hung up devices before calling the line discipline method. * * Locking: * Locks the line discipline internally while needed. Multiple read calls * may be outstanding in parallel. */ static ssize_t tty_read(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, inode, "tty_read")) return -EIO; if (!tty || tty_io_error(tty)) return -EIO; /* We want to wait for the line discipline to sort out in this * situation. */ ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_read(iocb, to); ret = -EIO; if (ld->ops->read) ret = iterate_tty_read(ld, tty, file, to); tty_ldisc_deref(ld); if (ret > 0) tty_update_time(tty, false); return ret; } void tty_write_unlock(struct tty_struct *tty) { mutex_unlock(&tty->atomic_write_lock); wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } int tty_write_lock(struct tty_struct *tty, bool ndelay) { if (!mutex_trylock(&tty->atomic_write_lock)) { if (ndelay) return -EAGAIN; if (mutex_lock_interruptible(&tty->atomic_write_lock)) return -ERESTARTSYS; } return 0; } /* * Split writes up in sane blocksizes to avoid * denial-of-service type attacks */ static ssize_t iterate_tty_write(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file, struct iov_iter *from) { size_t chunk, count = iov_iter_count(from); ssize_t ret, written = 0; ret = tty_write_lock(tty, file->f_flags & O_NDELAY); if (ret < 0) return ret; /* * We chunk up writes into a temporary buffer. This * simplifies low-level drivers immensely, since they * don't have locking issues and user mode accesses. * * But if TTY_NO_WRITE_SPLIT is set, we should use a * big chunk-size.. * * The default chunk-size is 2kB, because the NTTY * layer has problems with bigger chunks. It will * claim to be able to handle more characters than * it actually does. */ chunk = 2048; if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags)) chunk = 65536; if (count < chunk) chunk = count; /* write_buf/write_cnt is protected by the atomic_write_lock mutex */ if (tty->write_cnt < chunk) { u8 *buf_chunk; if (chunk < 1024) chunk = 1024; buf_chunk = kvmalloc(chunk, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!buf_chunk) { ret = -ENOMEM; goto out; } kvfree(tty->write_buf); tty->write_cnt = chunk; tty->write_buf = buf_chunk; } /* Do the write .. */ for (;;) { size_t size = min(chunk, count); ret = -EFAULT; if (copy_from_iter(tty->write_buf, size, from) != size) break; ret = ld->ops->write(tty, file, tty->write_buf, size); if (ret <= 0) break; written += ret; if (ret > size) break; /* FIXME! Have Al check this! */ if (ret != size) iov_iter_revert(from, size-ret); count -= ret; if (!count) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; cond_resched(); } if (written) { tty_update_time(tty, true); ret = written; } out: tty_write_unlock(tty); return ret; } #ifdef CONFIG_PRINT_QUOTA_WARNING /** * tty_write_message - write a message to a certain tty, not just the console. * @tty: the destination tty_struct * @msg: the message to write * * This is used for messages that need to be redirected to a specific tty. We * don't put it into the syslog queue right now maybe in the future if really * needed. * * We must still hold the BTM and test the CLOSING flag for the moment. * * This function is DEPRECATED, do not use in new code. */ void tty_write_message(struct tty_struct *tty, char *msg) { if (tty) { mutex_lock(&tty->atomic_write_lock); tty_lock(tty); if (tty->ops->write && tty->count > 0) tty->ops->write(tty, msg, strlen(msg)); tty_unlock(tty); tty_write_unlock(tty); } } #endif static ssize_t file_tty_write(struct file *file, struct kiocb *iocb, struct iov_iter *from) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, file_inode(file), "tty_write")) return -EIO; if (!tty || !tty->ops->write || tty_io_error(tty)) return -EIO; /* Short term debug to catch buggy drivers */ if (tty->ops->write_room == NULL) tty_err(tty, "missing write_room method\n"); ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_write(iocb, from); if (!ld->ops->write) ret = -EIO; else ret = iterate_tty_write(ld, tty, file, from); tty_ldisc_deref(ld); return ret; } /** * tty_write - write method for tty device file * @iocb: kernel I/O control block * @from: iov_iter with data to write * * Write data to a tty device via the line discipline. * * Locking: * Locks the line discipline as required * Writes to the tty driver are serialized by the atomic_write_lock * and are then processed in chunks to the device. The line * discipline write method will not be invoked in parallel for * each device. */ static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from) { return file_tty_write(iocb->ki_filp, iocb, from); } ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *p = NULL; spin_lock(&redirect_lock); if (redirect) p = get_file(redirect); spin_unlock(&redirect_lock); /* * We know the redirected tty is just another tty, we can * call file_tty_write() directly with that file pointer. */ if (p) { ssize_t res; res = file_tty_write(p, iocb, iter); fput(p); return res; } return tty_write(iocb, iter); } /** * tty_send_xchar - send priority character * @tty: the tty to send to * @ch: xchar to send * * Send a high priority character to the tty even if stopped. * * Locking: none for xchar method, write ordering for write method. */ int tty_send_xchar(struct tty_struct *tty, u8 ch) { bool was_stopped = tty->flow.stopped; if (tty->ops->send_xchar) { down_read(&tty->termios_rwsem); tty->ops->send_xchar(tty, ch); up_read(&tty->termios_rwsem); return 0; } if (tty_write_lock(tty, false) < 0) return -ERESTARTSYS; down_read(&tty->termios_rwsem); if (was_stopped) start_tty(tty); tty->ops->write(tty, &ch, 1); if (was_stopped) stop_tty(tty); up_read(&tty->termios_rwsem); tty_write_unlock(tty); return 0; } /** * pty_line_name - generate name for a pty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 6 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. * * Locking: None */ static void pty_line_name(struct tty_driver *driver, int index, char *p) { static const char ptychar[] = "pqrstuvwxyzabcde"; int i = index + driver->name_base; /* ->name is initialized to "ttyp", but "tty" is expected */ sprintf(p, "%s%c%x", driver->subtype == PTY_TYPE_SLAVE ? "tty" : driver->name, ptychar[i >> 4 & 0xf], i & 0xf); } /** * tty_line_name - generate name for a tty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 7 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. * * Locking: None */ static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p) { if (driver->flags & TTY_DRIVER_UNNUMBERED_NODE) return sprintf(p, "%s", driver->name); else return sprintf(p, "%s%d", driver->name, index + driver->name_base); } /** * tty_driver_lookup_tty() - find an existing tty, if any * @driver: the driver for the tty * @file: file object * @idx: the minor number * * Return: the tty, if found. If not found, return %NULL or ERR_PTR() if the * driver lookup() method returns an error. * * Locking: tty_mutex must be held. If the tty is found, bump the tty kref. */ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; if (driver->ops->lookup) { if (!file) tty = ERR_PTR(-EIO); else tty = driver->ops->lookup(driver, file, idx); } else { if (idx >= driver->num) return ERR_PTR(-EINVAL); tty = driver->ttys[idx]; } if (!IS_ERR(tty)) tty_kref_get(tty); return tty; } /** * tty_init_termios - helper for termios setup * @tty: the tty to set up * * Initialise the termios structure for this tty. This runs under the * %tty_mutex currently so we can be relaxed about ordering. */ void tty_init_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) tty->termios = tty->driver->init_termios; else { /* Check for lazy saved data */ tp = tty->driver->termios[idx]; if (tp != NULL) { tty->termios = *tp; tty->termios.c_line = tty->driver->init_termios.c_line; } else tty->termios = tty->driver->init_termios; } /* Compatibility until drivers always set this */ tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios); tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios); } EXPORT_SYMBOL_GPL(tty_init_termios); /** * tty_standard_install - usual tty->ops->install * @driver: the driver for the tty * @tty: the tty * * If the @driver overrides @tty->ops->install, it still can call this function * to perform the standard install operations. */ int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty) { tty_init_termios(tty); tty_driver_kref_get(driver); tty->count++; driver->ttys[tty->index] = tty; return 0; } EXPORT_SYMBOL_GPL(tty_standard_install); /** * tty_driver_install_tty() - install a tty entry in the driver * @driver: the driver for the tty * @tty: the tty * * Install a tty object into the driver tables. The @tty->index field will be * set by the time this is called. This method is responsible for ensuring any * need additional structures are allocated and configured. * * Locking: tty_mutex for now */ static int tty_driver_install_tty(struct tty_driver *driver, struct tty_struct *tty) { return driver->ops->install ? driver->ops->install(driver, tty) : tty_standard_install(driver, tty); } /** * tty_driver_remove_tty() - remove a tty from the driver tables * @driver: the driver for the tty * @tty: tty to remove * * Remove a tty object from the driver tables. The tty->index field will be set * by the time this is called. * * Locking: tty_mutex for now */ static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty) { if (driver->ops->remove) driver->ops->remove(driver, tty); else driver->ttys[tty->index] = NULL; } /** * tty_reopen() - fast re-open of an open tty * @tty: the tty to open * * Re-opens on master ptys are not allowed and return -%EIO. * * Locking: Caller must hold tty_lock * Return: 0 on success, -errno on error. */ static int tty_reopen(struct tty_struct *tty) { struct tty_driver *driver = tty->driver; struct tty_ldisc *ld; int retval = 0; if (driver->type == TTY_DRIVER_TYPE_PTY && driver->subtype == PTY_TYPE_MASTER) return -EIO; if (!tty->count) return -EAGAIN; if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN)) return -EBUSY; ld = tty_ldisc_ref_wait(tty); if (ld) { tty_ldisc_deref(ld); } else { retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) return retval; if (!tty->ldisc) retval = tty_ldisc_reinit(tty, tty->termios.c_line); tty_ldisc_unlock(tty); } if (retval == 0) tty->count++; return retval; } /** * tty_init_dev - initialise a tty device * @driver: tty driver we are opening a device on * @idx: device index * * Prepare a tty device. This may not be a "new" clean device but could also be * an active device. The pty drivers require special handling because of this. * * Locking: * The function is called under the tty_mutex, which protects us from the * tty struct or driver itself going away. * * On exit the tty device has the line discipline attached and a reference * count of 1. If a pair was created for pty/tty use and the other was a pty * master then it too has a reference count of 1. * * WSH 06/09/97: Rewritten to remove races and properly clean up after a failed * open. The new code protects the open with a mutex, so it's really quite * straightforward. The mutex locking can probably be relaxed for the (most * common) case of reopening a tty. * * Return: new tty structure */ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) { struct tty_struct *tty; int retval; /* * First time open is complex, especially for PTY devices. * This code guarantees that either everything succeeds and the * TTY is ready for operation, or else the table slots are vacated * and the allocated memory released. (Except that the termios * may be retained.) */ if (!try_module_get(driver->owner)) return ERR_PTR(-ENODEV); tty = alloc_tty_struct(driver, idx); if (!tty) { retval = -ENOMEM; goto err_module_put; } tty_lock(tty); retval = tty_driver_install_tty(driver, tty); if (retval < 0) goto err_free_tty; if (!tty->port) tty->port = driver->ports[idx]; if (WARN_RATELIMIT(!tty->port, "%s: %s driver does not set tty->port. This would crash the kernel. Fix the driver!\n", __func__, tty->driver->name)) { retval = -EINVAL; goto err_release_lock; } retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) goto err_release_lock; tty->port->itty = tty; /* * Structures all installed ... call the ldisc open routines. * If we fail here just call release_tty to clean up. No need * to decrement the use counts, as release_tty doesn't care. */ retval = tty_ldisc_setup(tty, tty->link); if (retval) goto err_release_tty; tty_ldisc_unlock(tty); /* Return the tty locked so that it cannot vanish under the caller */ return tty; err_free_tty: tty_unlock(tty); free_tty_struct(tty); err_module_put: module_put(driver->owner); return ERR_PTR(retval); /* call the tty release_tty routine to clean out this slot */ err_release_tty: tty_ldisc_unlock(tty); tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n", retval, idx); err_release_lock: tty_unlock(tty); release_tty(tty, idx); return ERR_PTR(retval); } /** * tty_save_termios() - save tty termios data in driver table * @tty: tty whose termios data to save * * Locking: Caller guarantees serialisation with tty_init_termios(). */ void tty_save_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; /* If the port is going to reset then it has no termios to save */ if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) return; /* Stash the termios data */ tp = tty->driver->termios[idx]; if (tp == NULL) { tp = kmalloc(sizeof(*tp), GFP_KERNEL); if (tp == NULL) return; tty->driver->termios[idx] = tp; } *tp = tty->termios; } EXPORT_SYMBOL_GPL(tty_save_termios); /** * tty_flush_works - flush all works of a tty/pty pair * @tty: tty device to flush works for (or either end of a pty pair) * * Sync flush all works belonging to @tty (and the 'other' tty). */ static void tty_flush_works(struct tty_struct *tty) { flush_work(&tty->SAK_work); flush_work(&tty->hangup_work); if (tty->link) { flush_work(&tty->link->SAK_work); flush_work(&tty->link->hangup_work); } } /** * release_one_tty - release tty structure memory * @work: work of tty we are obliterating * * Releases memory associated with a tty structure, and clears out the * driver table slots. This function is called when a device is no longer * in use. It also gets called when setup of a device fails. * * Locking: * takes the file list lock internally when working on the list of ttys * that the driver keeps. * * This method gets called from a work queue so that the driver private * cleanup ops can sleep (needed for USB at least) */ static void release_one_tty(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); struct tty_driver *driver = tty->driver; struct module *owner = driver->owner; if (tty->ops->cleanup) tty->ops->cleanup(tty); tty_driver_kref_put(driver); module_put(owner); spin_lock(&tty->files_lock); list_del_init(&tty->tty_files); spin_unlock(&tty->files_lock); put_pid(tty->ctrl.pgrp); put_pid(tty->ctrl.session); free_tty_struct(tty); } static void queue_release_one_tty(struct kref *kref) { struct tty_struct *tty = container_of(kref, struct tty_struct, kref); /* The hangup queue is now free so we can reuse it rather than * waste a chunk of memory for each port. */ INIT_WORK(&tty->hangup_work, release_one_tty); schedule_work(&tty->hangup_work); } /** * tty_kref_put - release a tty kref * @tty: tty device * * Release a reference to the @tty device and if need be let the kref layer * destruct the object for us. */ void tty_kref_put(struct tty_struct *tty) { if (tty) kref_put(&tty->kref, queue_release_one_tty); } EXPORT_SYMBOL(tty_kref_put); /** * release_tty - release tty structure memory * @tty: tty device release * @idx: index of the tty device release * * Release both @tty and a possible linked partner (think pty pair), * and decrement the refcount of the backing module. * * Locking: * tty_mutex * takes the file list lock internally when working on the list of ttys * that the driver keeps. */ static void release_tty(struct tty_struct *tty, int idx) { /* This should always be true but check for the moment */ WARN_ON(tty->index != idx); WARN_ON(!mutex_is_locked(&tty_mutex)); if (tty->ops->shutdown) tty->ops->shutdown(tty); tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); if (tty->port) tty->port->itty = NULL; if (tty->link) tty->link->port->itty = NULL; if (tty->port) tty_buffer_cancel_work(tty->port); if (tty->link) tty_buffer_cancel_work(tty->link->port); tty_kref_put(tty->link); tty_kref_put(tty); } /** * tty_release_checks - check a tty before real release * @tty: tty to check * @idx: index of the tty * * Performs some paranoid checking before true release of the @tty. This is a * no-op unless %TTY_PARANOIA_CHECK is defined. */ static int tty_release_checks(struct tty_struct *tty, int idx) { #ifdef TTY_PARANOIA_CHECK if (idx < 0 || idx >= tty->driver->num) { tty_debug(tty, "bad idx %d\n", idx); return -1; } /* not much to check for devpts */ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) return 0; if (tty != tty->driver->ttys[idx]) { tty_debug(tty, "bad driver table[%d] = %p\n", idx, tty->driver->ttys[idx]); return -1; } if (tty->driver->other) { struct tty_struct *o_tty = tty->link; if (o_tty != tty->driver->other->ttys[idx]) { tty_debug(tty, "bad other table[%d] = %p\n", idx, tty->driver->other->ttys[idx]); return -1; } if (o_tty->link != tty) { tty_debug(tty, "bad link = %p\n", o_tty->link); return -1; } } #endif return 0; } /** * tty_kclose - closes tty opened by tty_kopen * @tty: tty device * * Performs the final steps to release and free a tty device. It is the same as * tty_release_struct() except that it also resets %TTY_PORT_KOPENED flag on * @tty->port. */ void tty_kclose(struct tty_struct *tty) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); tty_port_set_kopened(tty->port, 0); release_tty(tty, tty->index); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_kclose); /** * tty_release_struct - release a tty struct * @tty: tty device * @idx: index of the tty * * Performs the final steps to release and free a tty device. It is roughly the * reverse of tty_init_dev(). */ void tty_release_struct(struct tty_struct *tty, int idx) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commmences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); release_tty(tty, idx); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_release_struct); /** * tty_release - vfs callback for close * @inode: inode of tty * @filp: file pointer for handle to tty * * Called the last time each file handle is closed that references this tty. * There may however be several such references. * * Locking: * Takes BKL. See tty_release_dev(). * * Even releasing the tty structures is a tricky business. We have to be very * careful that the structures are all released at the same time, as interrupts * might otherwise get the wrong pointers. * * WSH 09/09/97: rewritten to avoid some nasty race conditions that could * lead to double frees or releasing memory still in use. */ int tty_release(struct inode *inode, struct file *filp) { struct tty_struct *tty = file_tty(filp); struct tty_struct *o_tty = NULL; int do_sleep, final; int idx; long timeout = 0; int once = 1; if (tty_paranoia_check(tty, inode, __func__)) return 0; tty_lock(tty); check_tty_count(tty, __func__); __tty_fasync(-1, filp, 0); idx = tty->index; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) o_tty = tty->link; if (tty_release_checks(tty, idx)) { tty_unlock(tty); return 0; } tty_debug_hangup(tty, "releasing (count=%d)\n", tty->count); if (tty->ops->close) tty->ops->close(tty, filp); /* If tty is pty master, lock the slave pty (stable lock order) */ tty_lock_slave(o_tty); /* * Sanity check: if tty->count is going to zero, there shouldn't be * any waiters on tty->read_wait or tty->write_wait. We test the * wait queues and kick everyone out _before_ actually starting to * close. This ensures that we won't block while releasing the tty * structure. * * The test for the o_tty closing is necessary, since the master and * slave sides may close in any order. If the slave side closes out * first, its count will be one, since the master side holds an open. * Thus this test wouldn't be triggered at the time the slave closed, * so we do it now. */ while (1) { do_sleep = 0; if (tty->count <= 1) { if (waitqueue_active(&tty->read_wait)) { wake_up_poll(&tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&tty->write_wait)) { wake_up_poll(&tty->write_wait, EPOLLOUT); do_sleep++; } } if (o_tty && o_tty->count <= 1) { if (waitqueue_active(&o_tty->read_wait)) { wake_up_poll(&o_tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&o_tty->write_wait)) { wake_up_poll(&o_tty->write_wait, EPOLLOUT); do_sleep++; } } if (!do_sleep) break; if (once) { once = 0; tty_warn(tty, "read/write wait queue active!\n"); } schedule_timeout_killable(timeout); if (timeout < 120 * HZ) timeout = 2 * timeout + 1; else timeout = MAX_SCHEDULE_TIMEOUT; } if (o_tty) { if (--o_tty->count < 0) { tty_warn(tty, "bad slave count (%d)\n", o_tty->count); o_tty->count = 0; } } if (--tty->count < 0) { tty_warn(tty, "bad tty->count (%d)\n", tty->count); tty->count = 0; } /* * We've decremented tty->count, so we need to remove this file * descriptor off the tty->tty_files list; this serves two * purposes: * - check_tty_count sees the correct number of file descriptors * associated with this tty. * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ tty_del_file(filp); /* * Perform some housekeeping before deciding whether to return. * * If _either_ side is closing, make sure there aren't any * processes that still think tty or o_tty is their controlling * tty. */ if (!tty->count) { read_lock(&tasklist_lock); session_clear_tty(tty->ctrl.session); if (o_tty) session_clear_tty(o_tty->ctrl.session); read_unlock(&tasklist_lock); } /* check whether both sides are closing ... */ final = !tty->count && !(o_tty && o_tty->count); tty_unlock_slave(o_tty); tty_unlock(tty); /* At this point, the tty->count == 0 should ensure a dead tty * cannot be re-opened by a racing opener. */ if (!final) return 0; tty_debug_hangup(tty, "final close\n"); tty_release_struct(tty, idx); return 0; } /** * tty_open_current_tty - get locked tty of current task * @device: device number * @filp: file pointer to tty * @return: locked tty of the current task iff @device is /dev/tty * * Performs a re-open of the current task's controlling tty. * * We cannot return driver and index like for the other nodes because devpts * will not work then. It expects inodes to be from devpts FS. */ static struct tty_struct *tty_open_current_tty(dev_t device, struct file *filp) { struct tty_struct *tty; int retval; if (device != MKDEV(TTYAUX_MAJOR, 0)) return NULL; tty = get_current_tty(); if (!tty) return ERR_PTR(-ENXIO); filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ tty_lock(tty); tty_kref_put(tty); /* safe to drop the kref now */ retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } return tty; } /** * tty_lookup_driver - lookup a tty driver for a given device file * @device: device number * @filp: file pointer to tty * @index: index for the device in the @return driver * * If returned value is not erroneous, the caller is responsible to decrement * the refcount by tty_driver_kref_put(). * * Locking: %tty_mutex protects get_tty_driver() * * Return: driver for this inode (with increased refcount) */ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp, int *index) { struct tty_driver *driver = NULL; switch (device) { #ifdef CONFIG_VT case MKDEV(TTY_MAJOR, 0): { extern struct tty_driver *console_driver; driver = tty_driver_kref_get(console_driver); *index = fg_console; break; } #endif case MKDEV(TTYAUX_MAJOR, 1): { struct tty_driver *console_driver = console_device(index); if (console_driver) { driver = tty_driver_kref_get(console_driver); if (driver && filp) { /* Don't let /dev/console block */ filp->f_flags |= O_NONBLOCK; break; } } if (driver) tty_driver_kref_put(driver); return ERR_PTR(-ENODEV); } default: driver = get_tty_driver(device, index); if (!driver) return ERR_PTR(-ENODEV); break; } return driver; } static struct tty_struct *tty_kopen(dev_t device, int shared) { struct tty_struct *tty; struct tty_driver *driver; int index = -1; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, NULL, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, NULL, index); if (IS_ERR(tty) || shared) goto out; if (tty) { /* drop kref from tty_driver_lookup_tty() */ tty_kref_put(tty); tty = ERR_PTR(-EBUSY); } else { /* tty_init_dev returns tty with the tty_lock held */ tty = tty_init_dev(driver, index); if (IS_ERR(tty)) goto out; tty_port_set_kopened(tty->port, 1); } out: mutex_unlock(&tty_mutex); tty_driver_kref_put(driver); return tty; } /** * tty_kopen_exclusive - open a tty device for kernel * @device: dev_t of device to open * * Opens tty exclusively for kernel. Performs the driver lookup, makes sure * it's not already opened and performs the first-time tty initialization. * * Claims the global %tty_mutex to serialize: * * concurrent first-time tty initialization * * concurrent tty driver removal w/ lookup * * concurrent tty removal from driver table * * Return: the locked initialized &tty_struct */ struct tty_struct *tty_kopen_exclusive(dev_t device) { return tty_kopen(device, 0); } EXPORT_SYMBOL_GPL(tty_kopen_exclusive); /** * tty_kopen_shared - open a tty device for shared in-kernel use * @device: dev_t of device to open * * Opens an already existing tty for in-kernel use. Compared to * tty_kopen_exclusive() above it doesn't ensure to be the only user. * * Locking: identical to tty_kopen() above. */ struct tty_struct *tty_kopen_shared(dev_t device) { return tty_kopen(device, 1); } EXPORT_SYMBOL_GPL(tty_kopen_shared); /** * tty_open_by_driver - open a tty device * @device: dev_t of device to open * @filp: file pointer to tty * * Performs the driver lookup, checks for a reopen, or otherwise performs the * first-time tty initialization. * * * Claims the global tty_mutex to serialize: * * concurrent first-time tty initialization * * concurrent tty driver removal w/ lookup * * concurrent tty removal from driver table * * Return: the locked initialized or re-opened &tty_struct */ static struct tty_struct *tty_open_by_driver(dev_t device, struct file *filp) { struct tty_struct *tty; struct tty_driver *driver = NULL; int index = -1; int retval; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, filp, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, filp, index); if (IS_ERR(tty)) { mutex_unlock(&tty_mutex); goto out; } if (tty) { if (tty_port_kopened(tty->port)) { tty_kref_put(tty); mutex_unlock(&tty_mutex); tty = ERR_PTR(-EBUSY); goto out; } mutex_unlock(&tty_mutex); retval = tty_lock_interruptible(tty); tty_kref_put(tty); /* drop kref from tty_driver_lookup_tty() */ if (retval) { if (retval == -EINTR) retval = -ERESTARTSYS; tty = ERR_PTR(retval); goto out; } retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } } else { /* Returns with the tty_lock held for now */ tty = tty_init_dev(driver, index); mutex_unlock(&tty_mutex); } out: tty_driver_kref_put(driver); return tty; } /** * tty_open - open a tty device * @inode: inode of device file * @filp: file pointer to tty * * tty_open() and tty_release() keep up the tty count that contains the number * of opens done on a tty. We cannot use the inode-count, as different inodes * might point to the same tty. * * Open-counting is needed for pty masters, as well as for keeping track of * serial lines: DTR is dropped when the last close happens. * (This is not done solely through tty->count, now. - Ted 1/27/92) * * The termios state of a pty is reset on the first open so that settings don't * persist across reuse. * * Locking: * * %tty_mutex protects tty, tty_lookup_driver() and tty_init_dev(). * * @tty->count should protect the rest. * * ->siglock protects ->signal/->sighand * * Note: the tty_unlock/lock cases without a ref are only safe due to %tty_mutex */ static int tty_open(struct inode *inode, struct file *filp) { struct tty_struct *tty; int noctty, retval; dev_t device = inode->i_rdev; unsigned saved_flags = filp->f_flags; nonseekable_open(inode, filp); retry_open: retval = tty_alloc_file(filp); if (retval) return -ENOMEM; tty = tty_open_current_tty(device, filp); if (!tty) tty = tty_open_by_driver(device, filp); if (IS_ERR(tty)) { tty_free_file(filp); retval = PTR_ERR(tty); if (retval != -EAGAIN || signal_pending(current)) return retval; schedule(); goto retry_open; } tty_add_file(tty, filp); check_tty_count(tty, __func__); tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); if (tty->ops->open) retval = tty->ops->open(tty, filp); else retval = -ENODEV; filp->f_flags = saved_flags; if (retval) { tty_debug_hangup(tty, "open error %d, releasing\n", retval); tty_unlock(tty); /* need to call tty_release without BTM */ tty_release(inode, filp); if (retval != -ERESTARTSYS) return retval; if (signal_pending(current)) return retval; schedule(); /* * Need to reset f_op in case a hangup happened. */ if (tty_hung_up_p(filp)) filp->f_op = &tty_fops; goto retry_open; } clear_bit(TTY_HUPPED, &tty->flags); noctty = (filp->f_flags & O_NOCTTY) || (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || device == MKDEV(TTYAUX_MAJOR, 1) || (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); if (!noctty) tty_open_proc_set_tty(filp, tty); tty_unlock(tty); return 0; } /** * tty_poll - check tty status * @filp: file being polled * @wait: poll wait structures to update * * Call the line discipline polling method to obtain the poll status of the * device. * * Locking: locks called line discipline but ldisc poll method may be * re-entered freely by other callers. */ static __poll_t tty_poll(struct file *filp, poll_table *wait) { struct tty_struct *tty = file_tty(filp); struct tty_ldisc *ld; __poll_t ret = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_poll")) return 0; ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_poll(filp, wait); if (ld->ops->poll) ret = ld->ops->poll(tty, filp, wait); tty_ldisc_deref(ld); return ret; } static int __tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); unsigned long flags; int retval = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; if (on) { retval = file_f_owner_allocate(filp); if (retval) goto out; } retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; if (on) { enum pid_type type; struct pid *pid; spin_lock_irqsave(&tty->ctrl.lock, flags); if (tty->ctrl.pgrp) { pid = tty->ctrl.pgrp; type = PIDTYPE_PGID; } else { pid = task_pid(current); type = PIDTYPE_TGID; } get_pid(pid); spin_unlock_irqrestore(&tty->ctrl.lock, flags); __f_setown(filp, pid, type, 0); put_pid(pid); retval = 0; } out: return retval; } static int tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); int retval = -ENOTTY; tty_lock(tty); if (!tty_hung_up_p(filp)) retval = __tty_fasync(fd, filp, on); tty_unlock(tty); return retval; } static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI); /** * tiocsti - fake input character * @tty: tty to fake input into * @p: pointer to character * * Fake input to a tty device. Does the necessary locking and input management. * * FIXME: does not honour flow control ?? * * Locking: * * Called functions take tty_ldiscs_lock * * current->signal->tty check is safe without locks */ static int tiocsti(struct tty_struct *tty, u8 __user *p) { struct tty_ldisc *ld; u8 ch; if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) return -EFAULT; tty_audit_tiocsti(tty, ch); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; tty_buffer_lock_exclusive(tty->port); if (ld->ops->receive_buf) ld->ops->receive_buf(tty, &ch, NULL, 1); tty_buffer_unlock_exclusive(tty->port); tty_ldisc_deref(ld); return 0; } /** * tiocgwinsz - implement window query ioctl * @tty: tty * @arg: user buffer for result * * Copies the kernel idea of the window size into the user buffer. * * Locking: @tty->winsize_mutex is taken to ensure the winsize data is * consistent. */ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg) { guard(mutex)(&tty->winsize_mutex); if (copy_to_user(arg, &tty->winsize, sizeof(*arg))) return -EFAULT; return 0; } /** * tty_do_resize - resize event * @tty: tty being resized * @ws: new dimensions * * Update the termios variables and send the necessary signals to peform a * terminal resize correctly. */ int tty_do_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp; guard(mutex)(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) return 0; /* Signal the foreground process group */ pgrp = tty_get_pgrp(tty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); put_pid(pgrp); tty->winsize = *ws; return 0; } EXPORT_SYMBOL(tty_do_resize); /** * tiocswinsz - implement window size set ioctl * @tty: tty side of tty * @arg: user buffer for result * * Copies the user idea of the window size to the kernel. Traditionally this is * just advisory information but for the Linux console it actually has driver * level meaning and triggers a VC resize. * * Locking: * Driver dependent. The default do_resize method takes the tty termios * mutex and ctrl.lock. The console takes its own lock then calls into the * default method. */ static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg) { struct winsize tmp_ws; if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) return -EFAULT; if (tty->ops->resize) return tty->ops->resize(tty, &tmp_ws); else return tty_do_resize(tty, &tmp_ws); } /** * tioccons - allow admin to move logical console * @file: the file to become console * * Allow the administrator to move the redirected console device. * * Locking: uses redirect_lock to guard the redirect information */ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (file->f_op->write_iter == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); f = redirect; redirect = NULL; spin_unlock(&redirect_lock); if (f) fput(f); return 0; } if (file->f_op->write_iter != tty_write) return -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; guard(spinlock)(&redirect_lock); if (redirect) return -EBUSY; redirect = get_file(file); return 0; } /** * tiocsetd - set line discipline * @tty: tty device * @p: pointer to user data * * Set the line discipline according to user request. * * Locking: see tty_set_ldisc(), this function is just a helper */ static int tiocsetd(struct tty_struct *tty, int __user *p) { int disc; int ret; if (get_user(disc, p)) return -EFAULT; ret = tty_set_ldisc(tty, disc); return ret; } /** * tiocgetd - get line discipline * @tty: tty device * @p: pointer to user data * * Retrieves the line discipline id directly from the ldisc. * * Locking: waits for ldisc reference (in case the line discipline is changing * or the @tty is being hungup) */ static int tiocgetd(struct tty_struct *tty, int __user *p) { struct tty_ldisc *ld; int ret; ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; ret = put_user(ld->ops->num, p); tty_ldisc_deref(ld); return ret; } /** * send_break - performed time break * @tty: device to break on * @duration: timeout in mS * * Perform a timed break on hardware that lacks its own driver level timed * break functionality. * * Locking: * @tty->atomic_write_lock serializes */ static int send_break(struct tty_struct *tty, unsigned int duration) { int retval; if (tty->ops->break_ctl == NULL) return 0; if (tty->driver->flags & TTY_DRIVER_HARDWARE_BREAK) return tty->ops->break_ctl(tty, duration); /* Do the work ourselves */ if (tty_write_lock(tty, false) < 0) return -EINTR; retval = tty->ops->break_ctl(tty, -1); if (!retval) { msleep_interruptible(duration); retval = tty->ops->break_ctl(tty, 0); } else if (retval == -EOPNOTSUPP) { /* some drivers can tell only dynamically */ retval = 0; } tty_write_unlock(tty); if (signal_pending(current)) retval = -EINTR; return retval; } /** * tty_get_tiocm - get tiocm status register * @tty: tty device * * Obtain the modem status bits from the tty driver if the feature * is supported. */ int tty_get_tiocm(struct tty_struct *tty) { int retval = -ENOTTY; if (tty->ops->tiocmget) retval = tty->ops->tiocmget(tty); return retval; } EXPORT_SYMBOL_GPL(tty_get_tiocm); /** * tty_tiocmget - get modem status * @tty: tty device * @p: pointer to result * * Obtain the modem status bits from the tty driver if the feature is * supported. Return -%ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmget(struct tty_struct *tty, int __user *p) { int retval; retval = tty_get_tiocm(tty); if (retval >= 0) retval = put_user(retval, p); return retval; } /** * tty_tiocmset - set modem status * @tty: tty device * @cmd: command - clear bits, set bits or set all * @p: pointer to desired bits * * Set the modem status bits from the tty driver if the feature * is supported. Return -%ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd, unsigned __user *p) { int retval; unsigned int set, clear, val; if (tty->ops->tiocmset == NULL) return -ENOTTY; retval = get_user(val, p); if (retval) return retval; set = clear = 0; switch (cmd) { case TIOCMBIS: set = val; break; case TIOCMBIC: clear = val; break; case TIOCMSET: set = val; clear = ~val; break; } set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; return tty->ops->tiocmset(tty, set, clear); } /** * tty_get_icount - get tty statistics * @tty: tty device * @icount: output parameter * * Gets a copy of the @tty's icount statistics. * * Locking: none (up to the driver) */ int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { memset(icount, 0, sizeof(*icount)); if (tty->ops->get_icount) return tty->ops->get_icount(tty, icount); else return -ENOTTY; } EXPORT_SYMBOL_GPL(tty_get_icount); static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) { struct serial_icounter_struct icount; int retval; retval = tty_get_icount(tty, &icount); if (retval != 0) return retval; if (copy_to_user(arg, &icount, sizeof(icount))) return -EFAULT; return 0; } static int tty_set_serial(struct tty_struct *tty, struct serial_struct *ss) { int flags; flags = ss->flags & ASYNC_DEPRECATED; if (flags) pr_warn_ratelimited("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", __func__, current->comm, flags); if (!tty->ops->set_serial) return -ENOTTY; return tty->ops->set_serial(tty, ss); } static int tty_tiocsserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; if (copy_from_user(&v, ss, sizeof(*ss))) return -EFAULT; return tty_set_serial(tty, &v); } static int tty_tiocgserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; int err; memset(&v, 0, sizeof(v)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err && copy_to_user(ss, &v, sizeof(v))) err = -EFAULT; return err; } /* * if pty, return the slave side (real_tty) * otherwise, return self */ static struct tty_struct *tty_pair_get_tty(struct tty_struct *tty) { if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) tty = tty->link; return tty; } /* * Split this up, as gcc can choke on it otherwise.. */ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_struct *real_tty; void __user *p = (void __user *)arg; int retval; struct tty_ldisc *ld; if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; real_tty = tty_pair_get_tty(tty); /* * Factor out some common prep work */ switch (cmd) { case TIOCSETD: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: retval = tty_check_change(tty); if (retval) return retval; if (cmd != TIOCCBRK) { tty_wait_until_sent(tty, 0); if (signal_pending(current)) return -EINTR; } break; } /* * Now do the stuff. */ switch (cmd) { case TIOCSTI: return tiocsti(tty, p); case TIOCGWINSZ: return tiocgwinsz(real_tty, p); case TIOCSWINSZ: return tiocswinsz(real_tty, p); case TIOCCONS: return real_tty != tty ? -EINVAL : tioccons(file); case TIOCEXCL: set_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNXCL: clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCGEXCL: { int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: if (!capable(CAP_SYS_ADMIN)) return -EPERM; tty_vhangup(tty); return 0; case TIOCGDEV: { unsigned int ret = new_encode_dev(tty_devnum(real_tty)); return put_user(ret, (unsigned int __user *)p); } /* * Break handling */ case TIOCSBRK: /* Turn break on, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, -1); return 0; case TIOCCBRK: /* Turn break off, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, 0); return 0; case TCSBRK: /* SVID version: non-zero arg --> no break */ /* non-zero arg means wait for all output data * to be sent (performed above) but don't send break. * This is used by the tcdrain() termios function. */ if (!arg) return send_break(tty, 250); return 0; case TCSBRKP: /* support for POSIX tcsendbreak() */ return send_break(tty, arg ? arg*100 : 250); case TIOCMGET: return tty_tiocmget(tty, p); case TIOCMSET: case TIOCMBIC: case TIOCMBIS: return tty_tiocmset(tty, cmd, p); case TIOCGICOUNT: return tty_tiocgicount(tty, p); case TCFLSH: switch (arg) { case TCIFLUSH: case TCIOFLUSH: /* flush tty buffer and allow ldisc to process ioctl */ tty_buffer_flush(tty, NULL); break; } break; case TIOCSSERIAL: return tty_tiocsserial(tty, p); case TIOCGSERIAL: return tty_tiocgserial(tty, p); case TIOCGPTPEER: /* Special because the struct file is needed */ return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_ioctl(file, cmd, arg); retval = -EINVAL; if (ld->ops->ioctl) { retval = ld->ops->ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } tty_ldisc_deref(ld); return retval; } #ifdef CONFIG_COMPAT struct serial_struct32 { compat_int_t type; compat_int_t line; compat_uint_t port; compat_int_t irq; compat_int_t flags; compat_int_t xmit_fifo_size; compat_int_t custom_divisor; compat_int_t baud_base; unsigned short close_delay; char io_type; char reserved_char; compat_int_t hub6; unsigned short closing_wait; /* time to wait before closing */ unsigned short closing_wait2; /* no longer used... */ compat_uint_t iomem_base; unsigned short iomem_reg_shift; unsigned int port_high; /* compat_ulong_t iomap_base FIXME */ compat_int_t reserved; }; static int compat_tty_tiocsserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; if (copy_from_user(&v32, ss, sizeof(*ss))) return -EFAULT; memcpy(&v, &v32, offsetof(struct serial_struct32, iomem_base)); v.iomem_base = compat_ptr(v32.iomem_base); v.iomem_reg_shift = v32.iomem_reg_shift; v.port_high = v32.port_high; v.iomap_base = 0; return tty_set_serial(tty, &v); } static int compat_tty_tiocgserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; int err; memset(&v, 0, sizeof(v)); memset(&v32, 0, sizeof(v32)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err) { memcpy(&v32, &v, offsetof(struct serial_struct32, iomem_base)); v32.iomem_base = (unsigned long)v.iomem_base >> 32 ? 0xfffffff : ptr_to_compat(v.iomem_base); v32.iomem_reg_shift = v.iomem_reg_shift; v32.port_high = v.port_high; if (copy_to_user(ss, &v32, sizeof(v32))) err = -EFAULT; } return err; } static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; int retval = -ENOIOCTLCMD; switch (cmd) { case TIOCOUTQ: case TIOCSTI: case TIOCGWINSZ: case TIOCSWINSZ: case TIOCGEXCL: case TIOCGETD: case TIOCSETD: case TIOCGDEV: case TIOCMGET: case TIOCMSET: case TIOCMBIC: case TIOCMBIS: case TIOCGICOUNT: case TIOCGPGRP: case TIOCSPGRP: case TIOCGSID: case TIOCSERGETLSR: case TIOCGRS485: case TIOCSRS485: #ifdef TIOCGETP case TIOCGETP: case TIOCSETP: case TIOCSETN: #endif #ifdef TIOCGETC case TIOCGETC: case TIOCSETC: #endif #ifdef TIOCGLTC case TIOCGLTC: case TIOCSLTC: #endif case TCSETSF: case TCSETSW: case TCSETS: case TCGETS: #ifdef TCGETS2 case TCGETS2: case TCSETSF2: case TCSETSW2: case TCSETS2: #endif case TCGETA: case TCSETAF: case TCSETAW: case TCSETA: case TIOCGLCKTRMIOS: case TIOCSLCKTRMIOS: #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: #endif case TIOCGSOFTCAR: case TIOCSSOFTCAR: case PPPIOCGCHAN: case PPPIOCGUNIT: return tty_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); case TIOCCONS: case TIOCEXCL: case TIOCNXCL: case TIOCVHANGUP: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: case TCFLSH: case TIOCGPTPEER: case TIOCNOTTY: case TIOCSCTTY: case TCXONC: case TIOCMIWAIT: case TIOCSERCONFIG: return tty_ioctl(file, cmd, arg); } if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; switch (cmd) { case TIOCSSERIAL: return compat_tty_tiocsserial(tty, compat_ptr(arg)); case TIOCGSERIAL: return compat_tty_tiocgserial(tty, compat_ptr(arg)); } if (tty->ops->compat_ioctl) { retval = tty->ops->compat_ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_compat_ioctl(file, cmd, arg); if (ld->ops->compat_ioctl) retval = ld->ops->compat_ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD && ld->ops->ioctl) retval = ld->ops->ioctl(tty, (unsigned long)compat_ptr(cmd), arg); tty_ldisc_deref(ld); return retval; } #endif static int this_tty(const void *t, struct file *file, unsigned fd) { if (likely(file->f_op->read_iter != tty_read)) return 0; return file_tty(file) != t ? 0 : fd + 1; } /* * This implements the "Secure Attention Key" --- the idea is to * prevent trojan horses by killing all processes associated with this * tty when the user hits the "Secure Attention Key". Required for * super-paranoid applications --- see the Orange Book for more details. * * This code could be nicer; ideally it should send a HUP, wait a few * seconds, then send a INT, and then a KILL signal. But you then * have to coordinate with the init process, since all processes associated * with the current tty must be dead before the new getty is allowed * to spawn. * * Now, if it would be correct ;-/ The current code has a nasty hole - * it doesn't catch files in flight. We may send the descriptor to ourselves * via AF_UNIX socket, close it and later fetch from socket. FIXME. * * Nasty bug: do_SAK is being called in interrupt context. This can * deadlock. We punt it up to process context. AKPM - 16Mar2001 */ void __do_SAK(struct tty_struct *tty) { struct task_struct *g, *p; struct pid *session; int i; scoped_guard(spinlock_irqsave, &tty->ctrl.lock) session = get_pid(tty->ctrl.session); tty_ldisc_flush(tty); tty_driver_flush_buffer(tty); read_lock(&tasklist_lock); /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { tty_notice(tty, "SAK: killed process %d (%s): by session\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } while_each_pid_task(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the tty open */ for_each_process_thread(g, p) { if (p->signal->tty == tty) { tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); continue; } guard(task_lock)(p); i = iterate_fd(p->files, 0, this_tty, tty); if (i != 0) { tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n", task_pid_nr(p), p->comm, i - 1); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } } read_unlock(&tasklist_lock); put_pid(session); } static void do_SAK_work(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, SAK_work); __do_SAK(tty); } /* * The tq handling here is a little racy - tty->SAK_work may already be queued. * Fortunately we don't need to worry, because if ->SAK_work is already queued, * the values which we write to it will be identical to the values which it * already has. --akpm */ void do_SAK(struct tty_struct *tty) { if (!tty) return; schedule_work(&tty->SAK_work); } EXPORT_SYMBOL(do_SAK); /* Must put_device() after it's unused! */ static struct device *tty_get_device(struct tty_struct *tty) { dev_t devt = tty_devnum(tty); return class_find_device_by_devt(&tty_class, devt); } /** * alloc_tty_struct - allocate a new tty * @driver: driver which will handle the returned tty * @idx: minor of the tty * * This subroutine allocates and initializes a tty structure. * * Locking: none - @tty in question is not exposed at this point */ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) { struct tty_struct *tty; tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT); if (!tty) return NULL; kref_init(&tty->kref); if (tty_ldisc_init(tty)) { kfree(tty); return NULL; } tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; mutex_init(&tty->legacy_mutex); mutex_init(&tty->throttle_mutex); init_rwsem(&tty->termios_rwsem); mutex_init(&tty->winsize_mutex); init_ldsem(&tty->ldisc_sem); init_waitqueue_head(&tty->write_wait); init_waitqueue_head(&tty->read_wait); INIT_WORK(&tty->hangup_work, do_tty_hangup); mutex_init(&tty->atomic_write_lock); spin_lock_init(&tty->ctrl.lock); spin_lock_init(&tty->flow.lock); spin_lock_init(&tty->files_lock); INIT_LIST_HEAD(&tty->tty_files); INIT_WORK(&tty->SAK_work, do_SAK_work); tty->driver = driver; tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); return tty; } /** * tty_put_char - write one character to a tty * @tty: tty * @ch: character to write * * Write one byte to the @tty using the provided @tty->ops->put_char() method * if present. * * Note: the specific put_char operation in the driver layer may go * away soon. Don't call it directly, use this method * * Return: the number of characters successfully output. */ int tty_put_char(struct tty_struct *tty, u8 ch) { if (tty->ops->put_char) return tty->ops->put_char(tty, ch); return tty->ops->write(tty, &ch, 1); } EXPORT_SYMBOL_GPL(tty_put_char); static int tty_cdev_add(struct tty_driver *driver, dev_t dev, unsigned int index, unsigned int count) { int err; /* init here, since reused cdevs cause crashes */ driver->cdevs[index] = cdev_alloc(); if (!driver->cdevs[index]) return -ENOMEM; driver->cdevs[index]->ops = &tty_fops; driver->cdevs[index]->owner = driver->owner; err = cdev_add(driver->cdevs[index], dev, count); if (err) kobject_put(&driver->cdevs[index]->kobj); return err; } /** * tty_register_device - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to NULL safely. * * This call is required to be made to register an individual tty device * if the tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If * that bit is not set, this function should not be called by a tty * driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *device) { return tty_register_device_attr(driver, index, device, NULL, NULL); } EXPORT_SYMBOL(tty_register_device); static void tty_device_create_release(struct device *dev) { dev_dbg(dev, "releasing...\n"); kfree(dev); } /** * tty_register_device_attr - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to %NULL safely. * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * This call is required to be made to register an individual tty device if the * tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If that bit is * not set, this function should not be called by a tty driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { char name[64]; dev_t devt = MKDEV(driver->major, driver->minor_start) + index; struct ktermios *tp; struct device *dev; int retval; if (index >= driver->num) { pr_err("%s: Attempt to register invalid tty line number (%d)\n", driver->name, index); return ERR_PTR(-EINVAL); } if (driver->type == TTY_DRIVER_TYPE_PTY) pty_line_name(driver, index, name); else tty_line_name(driver, index, name); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); dev->devt = devt; dev->class = &tty_class; dev->parent = device; dev->release = tty_device_create_release; dev_set_name(dev, "%s", name); dev->groups = attr_grp; dev_set_drvdata(dev, drvdata); dev_set_uevent_suppress(dev, 1); retval = device_register(dev); if (retval) goto err_put; if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { /* * Free any saved termios data so that the termios state is * reset when reusing a minor number. */ tp = driver->termios[index]; if (tp) { driver->termios[index] = NULL; kfree(tp); } retval = tty_cdev_add(driver, devt, index, 1); if (retval) goto err_del; } dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); return dev; err_del: device_del(dev); err_put: put_device(dev); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(tty_register_device_attr); /** * tty_unregister_device - unregister a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * * If a tty device is registered with a call to tty_register_device() then * this function must be called when the tty device is gone. * * Locking: ?? */ void tty_unregister_device(struct tty_driver *driver, unsigned index) { device_destroy(&tty_class, MKDEV(driver->major, driver->minor_start) + index); if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { cdev_del(driver->cdevs[index]); driver->cdevs[index] = NULL; } } EXPORT_SYMBOL(tty_unregister_device); /** * __tty_alloc_driver - allocate tty driver * @lines: count of lines this driver can handle at most * @owner: module which is responsible for this driver * @flags: some of enum tty_driver_flag, will be set in driver->flags * * This should not be called directly, tty_alloc_driver() should be used * instead. * * Returns: struct tty_driver or a PTR-encoded error (use IS_ERR() and friends). */ struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags) { struct tty_driver *driver; unsigned int cdevs = 1; int err; if (!lines || (flags & TTY_DRIVER_UNNUMBERED_NODE && lines > 1)) return ERR_PTR(-EINVAL); driver = kzalloc(sizeof(*driver), GFP_KERNEL); if (!driver) return ERR_PTR(-ENOMEM); kref_init(&driver->kref); driver->num = lines; driver->owner = owner; driver->flags = flags; if (!(flags & TTY_DRIVER_DEVPTS_MEM)) { driver->ttys = kcalloc(lines, sizeof(*driver->ttys), GFP_KERNEL); driver->termios = kcalloc(lines, sizeof(*driver->termios), GFP_KERNEL); if (!driver->ttys || !driver->termios) { err = -ENOMEM; goto err_free_all; } } if (!(flags & TTY_DRIVER_DYNAMIC_ALLOC)) { driver->ports = kcalloc(lines, sizeof(*driver->ports), GFP_KERNEL); if (!driver->ports) { err = -ENOMEM; goto err_free_all; } cdevs = lines; } driver->cdevs = kcalloc(cdevs, sizeof(*driver->cdevs), GFP_KERNEL); if (!driver->cdevs) { err = -ENOMEM; goto err_free_all; } return driver; err_free_all: kfree(driver->ports); kfree(driver->ttys); kfree(driver->termios); kfree(driver->cdevs); kfree(driver); return ERR_PTR(err); } EXPORT_SYMBOL(__tty_alloc_driver); static void destruct_tty_driver(struct kref *kref) { struct tty_driver *driver = container_of(kref, struct tty_driver, kref); int i; struct ktermios *tp; if (driver->flags & TTY_DRIVER_INSTALLED) { for (i = 0; i < driver->num; i++) { tp = driver->termios[i]; if (tp) { driver->termios[i] = NULL; kfree(tp); } if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) tty_unregister_device(driver, i); } proc_tty_unregister_driver(driver); if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) cdev_del(driver->cdevs[0]); } kfree(driver->cdevs); kfree(driver->ports); kfree(driver->termios); kfree(driver->ttys); kfree(driver); } /** * tty_driver_kref_put - drop a reference to a tty driver * @driver: driver of which to drop the reference * * The final put will destroy and free up the driver. */ void tty_driver_kref_put(struct tty_driver *driver) { kref_put(&driver->kref, destruct_tty_driver); } EXPORT_SYMBOL(tty_driver_kref_put); /** * tty_register_driver - register a tty driver * @driver: driver to register * * Called by a tty driver to register itself. */ int tty_register_driver(struct tty_driver *driver) { int error; int i; dev_t dev; struct device *d; if (!driver->major) { error = alloc_chrdev_region(&dev, driver->minor_start, driver->num, driver->name); if (!error) { driver->major = MAJOR(dev); driver->minor_start = MINOR(dev); } } else { dev = MKDEV(driver->major, driver->minor_start); error = register_chrdev_region(dev, driver->num, driver->name); } if (error < 0) goto err; if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) { error = tty_cdev_add(driver, dev, 0, driver->num); if (error) goto err_unreg_char; } scoped_guard(mutex, &tty_mutex) list_add(&driver->tty_drivers, &tty_drivers); if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) { for (i = 0; i < driver->num; i++) { d = tty_register_device(driver, i, NULL); if (IS_ERR(d)) { error = PTR_ERR(d); goto err_unreg_devs; } } } proc_tty_register_driver(driver); driver->flags |= TTY_DRIVER_INSTALLED; return 0; err_unreg_devs: for (i--; i >= 0; i--) tty_unregister_device(driver, i); scoped_guard(mutex, &tty_mutex) list_del(&driver->tty_drivers); err_unreg_char: unregister_chrdev_region(dev, driver->num); err: return error; } EXPORT_SYMBOL(tty_register_driver); /** * tty_unregister_driver - unregister a tty driver * @driver: driver to unregister * * Called by a tty driver to unregister itself. */ void tty_unregister_driver(struct tty_driver *driver) { unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), driver->num); scoped_guard(mutex, &tty_mutex) list_del(&driver->tty_drivers); } EXPORT_SYMBOL(tty_unregister_driver); dev_t tty_devnum(struct tty_struct *tty) { return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; } EXPORT_SYMBOL(tty_devnum); void tty_default_fops(struct file_operations *fops) { *fops = tty_fops; } static char *tty_devnode(const struct device *dev, umode_t *mode) { if (!mode) return NULL; if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) || dev->devt == MKDEV(TTYAUX_MAJOR, 2)) *mode = 0666; return NULL; } const struct class tty_class = { .name = "tty", .devnode = tty_devnode, }; static int __init tty_class_init(void) { return class_register(&tty_class); } postcore_initcall(tty_class_init); /* 3/2004 jmc: why do these devices exist? */ static struct cdev tty_cdev, console_cdev; static ssize_t show_cons_active(struct device *dev, struct device_attribute *attr, char *buf) { struct console *cs[16]; int i = 0; struct console *c; ssize_t count = 0; /* * Hold the console_list_lock to guarantee that no consoles are * unregistered until all console processing is complete. * This also allows safe traversal of the console list and * race-free reading of @flags. */ console_list_lock(); for_each_console(c) { if (!c->device) continue; if (!(c->flags & CON_NBCON) && !c->write) continue; if ((c->flags & CON_ENABLED) == 0) continue; cs[i++] = c; if (i >= ARRAY_SIZE(cs)) break; } /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); while (i--) { int index = cs[i]->index; struct tty_driver *drv = cs[i]->device(cs[i], &index); /* don't resolve tty0 as some programs depend on it */ if (drv && (cs[i]->index > 0 || drv->major != TTY_MAJOR)) count += tty_line_name(drv, index, buf + count); else count += sprintf(buf + count, "%s%d", cs[i]->name, cs[i]->index); count += sprintf(buf + count, "%c", i ? ' ':'\n'); } console_unlock(); console_list_unlock(); return count; } static DEVICE_ATTR(active, S_IRUGO, show_cons_active, NULL); static struct attribute *cons_dev_attrs[] = { &dev_attr_active.attr, NULL }; ATTRIBUTE_GROUPS(cons_dev); static struct device *consdev; void console_sysfs_notify(void) { if (consdev) sysfs_notify(&consdev->kobj, NULL, "active"); } static const struct ctl_table tty_table[] = { { .procname = "legacy_tiocsti", .data = &tty_legacy_tiocsti, .maxlen = sizeof(tty_legacy_tiocsti), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "ldisc_autoload", .data = &tty_ldisc_autoload, .maxlen = sizeof(tty_ldisc_autoload), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; /* * Ok, now we can initialize the rest of the tty devices and can count * on memory allocations, interrupts etc.. */ int __init tty_init(void) { register_sysctl_init("dev/tty", tty_table); cdev_init(&tty_cdev, &tty_fops); if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0) panic("Couldn't register /dev/tty driver\n"); device_create(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty"); cdev_init(&console_cdev, &console_fops); if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0) panic("Couldn't register /dev/console driver\n"); consdev = device_create_with_groups(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 1), NULL, cons_dev_groups, "console"); if (IS_ERR(consdev)) consdev = NULL; #ifdef CONFIG_VT vty_init(&console_fops); #endif return 0; }
1866 1868 1867 1867 1866 1866 1869 1866 626 627 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-only /* * Link physical devices with ACPI devices support * * Copyright (c) 2005 David Shaohua Li <shaohua.li@intel.com> * Copyright (c) 2005 Intel Corp. */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi_iort.h> #include <linux/export.h> #include <linux/init.h> #include <linux/list.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/acpi.h> #include <linux/dma-mapping.h> #include <linux/pci.h> #include <linux/pci-acpi.h> #include <linux/platform_device.h> #include "internal.h" static LIST_HEAD(bus_type_list); static DECLARE_RWSEM(bus_type_sem); #define PHYSICAL_NODE_STRING "physical_node" #define PHYSICAL_NODE_NAME_SIZE (sizeof(PHYSICAL_NODE_STRING) + 10) int register_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return -ENODEV; if (type && type->match && type->find_companion) { down_write(&bus_type_sem); list_add_tail(&type->list, &bus_type_list); up_write(&bus_type_sem); pr_info("bus type %s registered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(register_acpi_bus_type); int unregister_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return 0; if (type) { down_write(&bus_type_sem); list_del_init(&type->list); up_write(&bus_type_sem); pr_info("bus type %s unregistered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(unregister_acpi_bus_type); static struct acpi_bus_type *acpi_get_bus_type(struct device *dev) { struct acpi_bus_type *tmp, *ret = NULL; down_read(&bus_type_sem); list_for_each_entry(tmp, &bus_type_list, list) { if (tmp->match(dev)) { ret = tmp; break; } } up_read(&bus_type_sem); return ret; } #define FIND_CHILD_MIN_SCORE 1 #define FIND_CHILD_MID_SCORE 2 #define FIND_CHILD_MAX_SCORE 3 static int match_any(struct acpi_device *adev, void *not_used) { return 1; } static bool acpi_dev_has_children(struct acpi_device *adev) { return acpi_dev_for_each_child(adev, match_any, NULL) > 0; } static int find_child_checks(struct acpi_device *adev, bool check_children) { unsigned long long sta; acpi_status status; if (check_children && !acpi_dev_has_children(adev)) return -ENODEV; status = acpi_evaluate_integer(adev->handle, "_STA", NULL, &sta); if (status == AE_NOT_FOUND) { /* * Special case: backlight device objects without _STA are * preferred to other objects with the same _ADR value, because * it is more likely that they are actually useful. */ if (adev->pnp.type.backlight) return FIND_CHILD_MID_SCORE; return FIND_CHILD_MIN_SCORE; } if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_ENABLED)) return -ENODEV; /* * If the device has a _HID returning a valid ACPI/PNP device ID, it is * better to make it look less attractive here, so that the other device * with the same _ADR value (that may not have a valid device ID) can be * matched going forward. [This means a second spec violation in a row, * so whatever we do here is best effort anyway.] */ if (adev->pnp.type.platform_id) return FIND_CHILD_MIN_SCORE; return FIND_CHILD_MAX_SCORE; } struct find_child_walk_data { struct acpi_device *adev; u64 address; int score; bool check_sta; bool check_children; }; static int check_one_child(struct acpi_device *adev, void *data) { struct find_child_walk_data *wd = data; int score; if (!adev->pnp.type.bus_address || acpi_device_adr(adev) != wd->address) return 0; if (!wd->adev) { /* * This is the first matching object, so save it. If it is not * necessary to look for any other matching objects, stop the * search. */ wd->adev = adev; return !(wd->check_sta || wd->check_children); } /* * There is more than one matching device object with the same _ADR * value. That really is unexpected, so we are kind of beyond the scope * of the spec here. We have to choose which one to return, though. * * First, get the score for the previously found object and terminate * the walk if it is maximum. */ if (!wd->score) { score = find_child_checks(wd->adev, wd->check_children); if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* * Second, if the object that has just been found has a better score, * replace the previously found one with it and terminate the walk if * the new score is maximum. */ score = find_child_checks(adev, wd->check_children); if (score > wd->score) { wd->adev = adev; if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* Continue, because there may be better matches. */ return 0; } static struct acpi_device *acpi_find_child(struct acpi_device *parent, u64 address, bool check_children, bool check_sta) { struct find_child_walk_data wd = { .address = address, .check_children = check_children, .check_sta = check_sta, .adev = NULL, .score = 0, }; if (parent) acpi_dev_for_each_child(parent, check_one_child, &wd); return wd.adev; } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children) { return acpi_find_child(parent, address, check_children, true); } EXPORT_SYMBOL_GPL(acpi_find_child_device); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr) { return acpi_find_child(adev, adr, false, false); } EXPORT_SYMBOL_GPL(acpi_find_child_by_adr); static void acpi_physnode_link_name(char *buf, unsigned int node_id) { if (node_id > 0) snprintf(buf, PHYSICAL_NODE_NAME_SIZE, PHYSICAL_NODE_STRING "%u", node_id); else strcpy(buf, PHYSICAL_NODE_STRING); } int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) { struct acpi_device_physical_node *physical_node, *pn; char physical_node_name[PHYSICAL_NODE_NAME_SIZE]; struct list_head *physnode_list; unsigned int node_id; int retval = -EINVAL; if (has_acpi_companion(dev)) { if (acpi_dev) { dev_warn(dev, "ACPI companion already set\n"); return -EINVAL; } else { acpi_dev = ACPI_COMPANION(dev); } } if (!acpi_dev) return -EINVAL; acpi_dev_get(acpi_dev); get_device(dev); physical_node = kzalloc(sizeof(*physical_node), GFP_KERNEL); if (!physical_node) { retval = -ENOMEM; goto err; } mutex_lock(&acpi_dev->physical_node_lock); /* * Keep the list sorted by node_id so that the IDs of removed nodes can * be recycled easily. */ physnode_list = &acpi_dev->physical_node_list; node_id = 0; list_for_each_entry(pn, &acpi_dev->physical_node_list, node) { /* Sanity check. */ if (pn->dev == dev) { mutex_unlock(&acpi_dev->physical_node_lock); dev_warn(dev, "Already associated with ACPI node\n"); kfree(physical_node); if (ACPI_COMPANION(dev) != acpi_dev) goto err; put_device(dev); acpi_dev_put(acpi_dev); return 0; } if (pn->node_id == node_id) { physnode_list = &pn->node; node_id++; } } physical_node->node_id = node_id; physical_node->dev = dev; list_add(&physical_node->node, physnode_list); acpi_dev->physical_node_count++; if (!has_acpi_companion(dev)) ACPI_COMPANION_SET(dev, acpi_dev); acpi_physnode_link_name(physical_node_name, node_id); retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, physical_node_name); if (retval) dev_err(&acpi_dev->dev, "Failed to create link %s (%d)\n", physical_node_name, retval); retval = sysfs_create_link(&dev->kobj, &acpi_dev->dev.kobj, "firmware_node"); if (retval) dev_err(dev, "Failed to create link firmware_node (%d)\n", retval); mutex_unlock(&acpi_dev->physical_node_lock); if (acpi_dev->wakeup.flags.valid) device_set_wakeup_capable(dev, true); return 0; err: ACPI_COMPANION_SET(dev, NULL); put_device(dev); acpi_dev_put(acpi_dev); return retval; } EXPORT_SYMBOL_GPL(acpi_bind_one); int acpi_unbind_one(struct device *dev) { struct acpi_device *acpi_dev = ACPI_COMPANION(dev); struct acpi_device_physical_node *entry; if (!acpi_dev) return 0; mutex_lock(&acpi_dev->physical_node_lock); list_for_each_entry(entry, &acpi_dev->physical_node_list, node) if (entry->dev == dev) { char physnode_name[PHYSICAL_NODE_NAME_SIZE]; list_del(&entry->node); acpi_dev->physical_node_count--; acpi_physnode_link_name(physnode_name, entry->node_id); sysfs_remove_link(&acpi_dev->dev.kobj, physnode_name); sysfs_remove_link(&dev->kobj, "firmware_node"); ACPI_COMPANION_SET(dev, NULL); /* Drop references taken by acpi_bind_one(). */ put_device(dev); acpi_dev_put(acpi_dev); kfree(entry); break; } mutex_unlock(&acpi_dev->physical_node_lock); return 0; } EXPORT_SYMBOL_GPL(acpi_unbind_one); void acpi_device_notify(struct device *dev) { struct acpi_device *adev; int ret; ret = acpi_bind_one(dev, NULL); if (ret) { struct acpi_bus_type *type = acpi_get_bus_type(dev); if (!type) goto err; adev = type->find_companion(dev); if (!adev) { dev_dbg(dev, "ACPI companion not found\n"); goto err; } ret = acpi_bind_one(dev, adev); if (ret) goto err; if (type->setup) { type->setup(dev); goto done; } } else { adev = ACPI_COMPANION(dev); if (dev_is_pci(dev)) { pci_acpi_setup(dev, adev); goto done; } else if (dev_is_platform(dev)) { acpi_configure_pmsi_domain(dev); } } if (adev->handler && adev->handler->bind) adev->handler->bind(dev); done: acpi_handle_debug(ACPI_HANDLE(dev), "Bound to device %s\n", dev_name(dev)); return; err: dev_dbg(dev, "No ACPI support\n"); } void acpi_device_notify_remove(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); if (!adev) return; if (dev_is_pci(dev)) pci_acpi_cleanup(dev, adev); else if (adev->handler && adev->handler->unbind) adev->handler->unbind(dev); acpi_unbind_one(dev); }
4 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> #include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); if (th == NULL) return NF_DROP; if (!synproxy_parse_options(skb, par->thoff, th, &opts)) return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ this_cpu_inc(snet->stats->syn_received); if (th->ece && th->cwr) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; opts.mss_encode = opts.mss_option; opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } return XT_CONTINUE; } static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); const struct ip6t_entry *e = par->entryinfo; int err; if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || e->ipv6.invflags & XT_INV_PROTO) return -EINVAL; err = nf_ct_netns_get(par->net, par->family); if (err) return err; err = nf_synproxy_ipv6_init(snet, par->net); if (err) { nf_ct_netns_put(par->net, par->family); return err; } return err; } static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg6_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg6, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg6_check, .destroy = synproxy_tg6_destroy, .me = THIS_MODULE, }; static int __init synproxy_tg6_init(void) { return xt_register_target(&synproxy_tg6_reg); } static void __exit synproxy_tg6_exit(void) { xt_unregister_target(&synproxy_tg6_reg); } module_init(synproxy_tg6_init); module_exit(synproxy_tg6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies");
131 1004 462 18739 151 141 8 2 21 4019 1557 127 1438 1562 19253 19015 160 18888 19104 1983 1669 447 448 123 1728 14 61 61 192 192 192 50 47 31 356 2 354 341 354 1091 1093 1051 50 336 186 66 8 33 81 239 181 57 182 258 259 273 274 37 696 357 13 331 77 75 77 16 70 77 77 78 78 8 4 4 177 1 77 98 4 6 13 16048 13 1428 4036 2838 1 1213 244 1439 1428 10 9 16057 10 16072 19181 16128 20 4023 5 15 1398 165 4005 19177 1513 8 17145 3 3 2 1 37 37 37 13 2 37 591 588 13 589 628 591 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/bvec.h> #include <linux/fault-inject-usercopy.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/scatterlist.h> #include <linux/instrumented.h> #include <linux/iov_iter.h> static __always_inline size_t copy_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (should_fail_usercopy()) return len; if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = raw_copy_to_user(iter_to, from, len); } return len; } static __always_inline size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { ssize_t res; if (should_fail_usercopy()) return len; from += progress; res = copy_to_user_nofault(iter_to, from, len); return res < 0 ? len : res; } static __always_inline size_t copy_from_user_iter(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { size_t res = len; if (should_fail_usercopy()) return len; if (access_ok(iter_from, len)) { to += progress; instrument_copy_from_user_before(to, iter_from, len); res = raw_copy_from_user(to, iter_from, len); instrument_copy_from_user_after(to, iter_from, len, res); } return res; } static __always_inline size_t memcpy_to_iter(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { memcpy(iter_to, from + progress, len); return 0; } static __always_inline size_t memcpy_from_iter(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy(to + progress, iter_from, len); return 0; } /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators. */ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_readable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_readable); /* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators. */ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static __always_inline size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = copy_mc_to_user(iter_to, from, len); } return len; } static __always_inline size_t memcpy_to_iter_mc(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { return copy_mc_to_kernel(iter_to, from + progress, len); } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static __always_inline size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, addr, copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); static __always_inline size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); } size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_nocache, memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE static __always_inline size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_flushcache(to + progress, iter_from, len); } static __always_inline size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy_flushcache(to + progress, iter_from, len); return 0; } /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_flushcache, memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. */ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); static __always_inline size_t zero_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *priv, void *priv2) { return clear_user(iter_to, len); } static __always_inline size_t zero_to_iter(void *iter_to, size_t progress, size_t len, void *priv, void *priv2) { memset(iter_to, 0, len); return 0; } size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, NULL, zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; if (!page_copy_sane(&folio->page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *to = kmap_local_folio(folio, offset); n = bytes - copied; if (folio_test_partial_kmap(folio) && n > PAGE_SIZE - offset_in_page(offset)) n = PAGE_SIZE - offset_in_page(offset); pagefault_disable(); n = __copy_from_iter(to, n, i); pagefault_enable(); kunmap_local(to); copied += n; offset += n; } while (copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_folio_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const struct iovec *iov, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; if (!i->count) return; i->count -= size; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; } size += i->iov_offset; /* From beginning of current segment. */ do { size_t fsize = folioq_folio_size(folioq, slot); if (likely(size < fsize)) break; size -= fsize; slot++; if (slot >= folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } while (size); i->iov_offset = size; i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_folioq(i)) { iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; for (;;) { size_t fsize; if (slot == 0) { folioq = folioq->prev; slot = folioq_nr_slots(folioq); } slot--; fsize = folioq_folio_size(folioq, slot); if (unroll <= fsize) { i->iov_offset = fsize - unroll; break; } unroll -= fsize; } i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else if (iov_iter_is_folioq(i)) { i->iov_offset = 0; iov_iter_folioq_revert(i, unroll); } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } if (unlikely(iov_iter_is_folioq(i))) return !i->count ? 0 : umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue * @i: The iterator to initialise. * @direction: The direction of the transfer. * @folioq: The starting point in the folio queue. * @first_slot: The first slot in the folio queue to use * @offset: The offset into the folio in the first slot to start at * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_FOLIOQ, .data_source = direction, .folioq = folioq, .folioq_slot = first_slot, .count = count, .iov_offset = offset, }; } EXPORT_SYMBOL(iov_iter_folio_queue); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator. */ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; } iov++; skip = 0; } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; do { size_t len = bvec->bv_len - skip; res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; bvec++; size -= len; skip = 0; } while (size); return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_folioq(i)) return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!*res) return 0; } return count; } static ssize_t iter_folioq_get_pages(struct iov_iter *iter, struct page ***ppages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { const struct folio_queue *folioq = iter->folioq; struct page **pages; unsigned int slot = iter->folioq_slot; size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(iov_offset != 0)) return -EIO; } maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); if (!maxpages) return -ENOMEM; *_start_offset = iov_offset & ~PAGE_MASK; pages = *ppages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); count -= part; iov_offset += part; extracted += part; *pages = folio_page(folio, offset / PAGE_SIZE); get_page(*pages); pages++; maxpages--; } if (maxpages == 0 || extracted >= maxsize) break; if (iov_offset >= fsize) { iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } iter->count = count; iter->iov_offset = iov_offset; iter->folioq = folioq; iter->folioq_slot = slot; return extracted; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct folio *folio; unsigned int ret = 0; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = folio_file_page(folio, xas.xa_index); folio_get(folio); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) { struct folio *folio = page_folio(page + k); p[k] = page + k; if (!folio_test_slab(folio)) folio_get(folio); } maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_folioq(i)) return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_folioq(i)) { unsigned offset = i->iov_offset % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; int ret = -EFAULT; u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { compat_uptr_t buf; compat_ssize_t len; unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); /* check for compat_size_t not fitting in compat_ssize_t .. */ if (len < 0) { ret = -EINVAL; goto uaccess_end; } iov[i].iov_base = compat_ptr(buf); iov[i].iov_len = len; } ret = 0; uaccess_end: user_access_end(); return ret; } static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; do { void __user *buf; ssize_t len; unsafe_get_user(len, &uiov->iov_len, uaccess_end); unsafe_get_user(buf, &uiov->iov_base, uaccess_end); /* check for size_t not fitting in ssize_t .. */ if (unlikely(len < 0)) { ret = -EINVAL; goto uaccess_end; } iov->iov_base = buf; iov->iov_len = len; uiov++; iov++; } while (--nr_segs); ret = 0; uaccess_end: user_access_end(); return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat) { struct iovec *iov = fast_iov; int ret; /* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so... */ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) { iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (!iov) return ERR_PTR(-ENOMEM); } if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov) kfree(iov); return ERR_PTR(ret); } return iov; } /* * Single segment iovec supplied by the user, import it as ITER_UBUF. */ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat) { struct iovec *iov = *iovp; ssize_t ret; *iovp = NULL; if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret; ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; return i->count; } ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) { ssize_t total_len = 0; unsigned long seg; struct iovec *iov; if (nr_segs == 1) return __import_iovec_ubuf(type, uvec, iovp, i, compat); iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; return PTR_ERR(iov); } /* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case. */ for (seg = 0; seg < nr_segs; seg++) { ssize_t len = (ssize_t)iov[seg].iov_len; if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp) kfree(iov); *iovp = NULL; return -EFAULT; } if (len > MAX_RW_COUNT - total_len) { len = MAX_RW_COUNT - total_len; iov[seg].iov_len = len; } total_len += len; } iov_iter_init(i, type, iov, nr_segs, total_len); if (iov == *iovp) *iovp = NULL; else *iovp = iov; return total_len; } /** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. * @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). * * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does * not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { const struct folio_queue *folioq = i->folioq; struct page **p; unsigned int nr = 0; size_t extracted = 0, offset, slot = i->folioq_slot; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(i->iov_offset != 0)) return -EIO; } offset = i->iov_offset & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); i->count -= part; i->iov_offset += part; extracted += part; p[nr++] = folio_page(folio, offset / PAGE_SIZE); } if (nr >= maxpages || extracted >= maxsize) break; if (i->iov_offset >= fsize) { i->iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } i->folioq = folioq; i->folioq_slot = slot; return extracted; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p; struct folio *folio; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = folio_file_page(folio, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { size_t skip = i->iov_offset, size = 0; struct bvec_iter bi; int k = 0; if (i->nr_segs == 0) return 0; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } bi.bi_idx = 0; bi.bi_size = maxsize; bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); while (bi.bi_size && bi.bi_idx < i->nr_segs) { struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); /* * The iov_iter_extract_pages interface only allows an offset * into the first page. Break out of the loop if we see an * offset into subsequent pages, the caller will have to call * iov_iter_extract_pages again for the reminder. */ if (k) { if (bv.bv_offset) break; } else { *offset0 = bv.bv_offset; } (*pages)[k++] = bv.bv_page; size += bv.bv_len; if (k >= maxpages) break; /* * We are done when the end of the bvec doesn't align to a page * boundary as that would create a hole in the returned space. * The caller will handle this with another call to * iov_iter_extract_pages. */ if (bv.bv_offset + bv.bv_len != PAGE_SIZE) break; bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); } iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref makes forces fork() to give the * child a copy of the page. */ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { unsigned long addr; unsigned int gup_flags = 0; size_t offset; int res; if (i->data_source == ITER_DEST) gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA) gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *offset0 = offset = addr % PAGE_SIZE; addr &= PAGE_MASK; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; } /** * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator * @i: The iterator to extract from * @pages: Where to return the list of pages * @maxsize: The maximum amount of iterator to extract * @maxpages: The maximum size of the list of pages * @extraction_flags: Flags to qualify request * @offset0: Where to return the starting offset into (*@pages)[0] * * Extract a list of contiguous pages from the current point of the iterator, * advancing the iterator. The maximum number of pages and the maximum amount * of page contents can be set. * * If *@pages is NULL, a page list will be allocated to the required size and * *@pages will be set to its base. If *@pages is not NULL, it will be assumed * that the caller allocated a page list at least @maxpages in size and this * will be filled in. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the * pages are merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_folioq(i)) return iov_iter_extract_folioq_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for SMC Connections, Link Groups and Links * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef _SMC_CORE_H #define _SMC_CORE_H #include <linux/atomic.h> #include <linux/types.h> #include <linux/smc.h> #include <linux/pci.h> #include <rdma/ib_verbs.h> #include <net/genetlink.h> #include <net/smc.h> #include "smc.h" #include "smc_ib.h" #include "smc_clc.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ #define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ #define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, * also is the default value for SMC-R v1 and v2.0 */ #define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 16-255 as needed. */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ u32 num; /* unique link group number */ }; enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ }; enum smc_link_state { /* possible states of a link */ SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ #define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; struct smc_wr_v2_buf { u8 raw[SMC_WR_BUF_V2_SIZE]; }; #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ enum smc_wr_reg_state { POSTED, /* ib_wr_reg_mr request posted */ CONFIRMED, /* ib_wr_reg_mr response: successful */ FAILED /* ib_wr_reg_mr response: failure */ }; struct smc_rdma_sge { /* sges for RDMA writes */ struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE]; }; #define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per * message send */ struct smc_rdma_sges { /* sges per message send */ struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES]; }; struct smc_rdma_wr { /* work requests per message * send */ struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; #define SMC_LGR_ID_SIZE 4 struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ struct ib_sge *wr_tx_sges; /* WR send gather meta data */ struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ atomic_long_t wr_tx_id; /* seq # of last sent WR */ unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ struct { struct percpu_ref wr_tx_refs; } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ int wr_rx_buflen; /* buffer len for the first sge, len for the * second sge is lgr shared if rx sge is 2. */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ struct { struct percpu_ref wr_reg_refs; } ____cacheline_aligned_in_smp; struct completion reg_ref_comp; enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ enum ib_mtu path_mtu; /* used mtu */ enum ib_mtu peer_mtu; /* mtu size of peer */ u32 psn_initial; /* QP tx initial packet seqno */ u32 peer_psn; /* QP rx initial packet seqno */ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ u8 clearing : 1; /* link is being cleared */ refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ }; /* For now we just allow one parallel link per link group. The SMC protocol * allows more (up to 8). */ #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ #define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the * default value for smc-r v1.0 and v2.0 */ #define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 1-2 as needed. */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { struct list_head list; void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ union { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; /* memory region: for rmb and * vzalloced sndbuf * incl. rkey provided to peer * and lkey provided to local */ u32 order; /* allocation order */ u8 is_conf_rkey; /* confirm_rkey done */ u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; /* mem region registered */ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; /* mem region mapped to lnk */ u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ u8 is_vm; /* virtually contiguous */ }; struct { /* SMC-D */ /* SMC-D tx buffer */ bool is_attached; /* no need for explicit writes */ /* SMC-D rx buffer: */ unsigned short sba_idx; /* SBA index number */ u64 token; /* DMB token number */ dma_addr_t dma_addr; /* DMA address */ }; }; }; struct smc_rtoken { /* address/key of remote RMB */ u64 dma_addr; u32 rkey; }; #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, * i.e. compressed 5 and thus 6 sizes (0..5), despite * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ struct smcd_dev; enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_NONE, /* no active links, lgr to be deleted */ SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ SMCR_PHYS_CONT_BUFS = 0, SMCR_VIRT_CONT_BUFS = 1, SMCR_MIXED_BUFS = 2, }; enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, SMC_LLC_FLOW_DEL_LINK = 4, SMC_LLC_FLOW_REQ_ADD_LINK = 5, SMC_LLC_FLOW_RKEY = 6, }; struct smc_llc_qentry; struct smc_llc_flow { enum smc_llc_flowtype type; struct smc_llc_qentry *qentry; }; struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ struct rw_semaphore rmbs_lock; /* protects rx buffers */ u64 alloc_sndbufs; /* stats of tx buffers */ u64 alloc_rmbs; /* stats of rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ struct work_struct terminate_work; /* abnormal lgr termination */ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; u8 peer_os; /* peer operating system */ u8 peer_smc_release; u8 peer_hostname[SMC_MAX_HOSTNAME_LEN]; union { struct { /* SMC-R */ enum smc_lgr_role role; /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ struct smc_wr_v2_buf *wr_rx_buf_v2; /* WR v2 recv payload buffer */ struct smc_wr_v2_buf *wr_tx_buf_v2; /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] [SMC_LINKS_PER_LGR_MAX]; /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ struct rw_semaphore llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; struct work_struct llc_del_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_flow_waiter; /* w4 next llc event */ wait_queue_head_t llc_msg_waiter; /* w4 next llc msg */ struct smc_llc_flow llc_flow_lcl; /* llc local control field */ struct smc_llc_flow llc_flow_rmt; /* llc remote control field */ struct smc_llc_qentry *delayed_event; /* arrived when flow active */ spinlock_t llc_flow_lock; /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ u32 llc_termination_rsn; /* rsn code for termination */ u8 nexthop_mac[ETH_ALEN]; u8 uses_gateway; __be32 saddr; /* net namespace */ struct net *net; u8 max_conns; /* max conn can be assigned to lgr */ u8 max_links; /* max links can be added in lgr */ }; struct { /* SMC-D */ struct smcd_gid peer_gid; /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ u8 peer_shutdown : 1; /* peer triggered shutdownn */ }; }; }; struct smc_clc_msg_local; #define GID_LIST_SIZE 2 struct smc_gidlist { u8 len; u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; }; struct smc_init_info_smcrv2 { /* Input fields */ __be32 saddr; struct sock *clc_sk; __be32 daddr; /* Output fields when saddr is set */ struct smc_ib_device *ib_dev_v2; u8 ib_port_v2; u8 ib_gid_v2[SMC_GID_SIZE]; /* Additional output fields when clc_sk and daddr is set as well */ u8 uses_gateway; u8 nexthop_mac[ETH_ALEN]; struct smc_gidlist gidlist; }; #define SMC_MAX_V2_ISM_DEVS SMCD_CLC_MAX_V2_GID_ENTRIES /* max # of proposed non-native ISM devices, * which can't exceed the max # of CHID-GID * entries in CLC proposal SMC-Dv2 extension. */ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; u8 smc_type_v2; u8 release_nr; u8 max_conns; u8 max_links; u8 first_contact_peer; u8 first_contact_local; u16 feature_mask; unsigned short vlan_id; u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; /* SMC-R */ u8 smcr_version; u8 check_smcrv2; u8 peer_gid[SMC_GID_SIZE]; u8 peer_mac[ETH_ALEN]; u8 peer_systemid[SMC_SYSTEMID_LEN]; struct smc_ib_device *ib_dev; u8 ib_gid[SMC_GID_SIZE]; u8 ib_port; u32 ib_clcqpn; struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ struct smcd_gid ism_peer_gid[SMC_MAX_V2_ISM_DEVS + 1]; struct smcd_dev *ism_dev[SMC_MAX_V2_ISM_DEVS + 1]; u16 ism_chid[SMC_MAX_V2_ISM_DEVS + 1]; u8 ism_offered_cnt; /* # of ISM devices offered */ u8 ism_selected; /* index of selected ISM dev*/ u8 smcd_version; }; /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. * Requires @conns_lock * @token alert token to search for * @lgr link group to search in * Returns connection associated with token if found, NULL otherwise. */ static inline struct smc_connection *smc_lgr_find_conn( u32 token, struct smc_link_group *lgr) { struct smc_connection *res = NULL; struct rb_node *node; node = lgr->conns_all.rb_node; while (node) { struct smc_connection *cur = rb_entry(node, struct smc_connection, alert_node); if (cur->alert_token_local > token) { node = node->rb_left; } else { if (cur->alert_token_local < token) { node = node->rb_right; } else { res = cur; break; } } } return res; } static inline bool smc_conn_lgr_valid(struct smc_connection *conn) { return conn->lgr && conn->alert_token_local; } /* * Returns true if the specified link is usable. * * usable means the link is ready to receive RDMA messages, map memory * on the link, etc. This doesn't ensure we are able to send RDMA messages * on this link, if sending RDMA messages is needed, use smc_link_sendable() */ static inline bool smc_link_usable(struct smc_link *lnk) { if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) return false; return true; } /* * Returns true if the specified link is ready to receive AND send RDMA * messages. * * For the client side in first contact, the underlying QP may still in * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() * is not strong enough. For those places that need to send any CDC or LLC * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead */ static inline bool smc_link_sendable(struct smc_link *lnk) { return smc_link_usable(lnk) && lnk->qp_attr.cur_qp_state == IB_QPS_RTS; } static inline bool smc_link_active(struct smc_link *lnk) { return lnk->state == SMC_LNK_ACTIVE; } static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) { return lnk->wr_rx_sge_cnt > 1; } static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", be16_to_cpu(((__be16 *)gid_raw)[0]), be16_to_cpu(((__be16 *)gid_raw)[1]), be16_to_cpu(((__be16 *)gid_raw)[2]), be16_to_cpu(((__be16 *)gid_raw)[3]), be16_to_cpu(((__be16 *)gid_raw)[4]), be16_to_cpu(((__be16 *)gid_raw)[5]), be16_to_cpu(((__be16 *)gid_raw)[6]), be16_to_cpu(((__be16 *)gid_raw)[7])); } struct smc_pci_dev { __u32 pci_fid; __u16 pci_pchid; __u16 pci_vendor; __u16 pci_device; __u8 pci_id[SMC_PCI_ID_STR_LEN]; }; static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_pci_dev *smc_dev) { smc_dev->pci_vendor = pci_dev->vendor; smc_dev->pci_device = pci_dev->device; snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", pci_name(pci_dev)); #if IS_ENABLED(CONFIG_S390) { /* Set s390 specific PCI information */ struct zpci_dev *zdev; zdev = to_zpci(pci_dev); smc_dev->pci_fid = zdev->fid; smc_dev->pci_pchid = zdev->pchid; } #endif } struct smc_sock; struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smc_lgr_hold(struct smc_link_group *lgr); void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); int smc_core_init(void); void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); void smcr_link_hold(struct smc_link *lnk); void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; } #endif
18 18 30 17 17 3 15 1 1 19 20 8 67 8 8 8 8 67 65 8 8 8 67 8 8 8 1 8 1 8 8 8 8 13 13 22 13 8 20 20 20 20 20 20 20 20 6 15 76 1 1 2 10 14 6 9 2 1 2 10 2 10 4 10 10 2 10 10 17 1 2 7 3 14 20 1 19 19 3 18 18 1 5 2 17 93 66 67 67 67 93 93 93 93 93 66 67 67 67 67 67 67 67 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 // SPDX-License-Identifier: GPL-2.0-only /* * IEEE 802.1Q Multiple Registration Protocol (MRP) * * Copyright (c) 2012 Massachusetts Institute of Technology * * Adapted from code in net/802/garp.c * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/module.h> #include <net/mrp.h> #include <linux/unaligned.h> static unsigned int mrp_join_time __read_mostly = 200; module_param(mrp_join_time, uint, 0644); MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)"); static unsigned int mrp_periodic_time __read_mostly = 1000; module_param(mrp_periodic_time, uint, 0644); MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)"); MODULE_DESCRIPTION("IEEE 802.1Q Multiple Registration Protocol (MRP)"); MODULE_LICENSE("GPL"); static const u8 mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = { [MRP_APPLICANT_VO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_IN] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO, }, [MRP_APPLICANT_VP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_AA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_IN] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP, }, [MRP_APPLICANT_VN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_AN, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN, }, [MRP_APPLICANT_AN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN, }, [MRP_APPLICANT_AA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_QA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_LA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_LV] = MRP_APPLICANT_LA, [MRP_EVENT_R_LA] = MRP_APPLICANT_LA, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA, }, [MRP_APPLICANT_AO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_AO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO, }, [MRP_APPLICANT_QO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO, }, [MRP_APPLICANT_AP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, [MRP_APPLICANT_QP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QP, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, }; static const u8 mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = { [MRP_APPLICANT_VO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL, [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV, [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL, }; static void mrp_attrvalue_inc(void *value, u8 len) { u8 *v = (u8 *)value; /* Add 1 to the last byte. If it becomes zero, * go to the previous byte and repeat. */ while (len > 0 && !++v[--len]) ; } static int mrp_attr_cmp(const struct mrp_attr *attr, const void *value, u8 len, u8 type) { if (attr->type != type) return attr->type - type; if (attr->len != len) return attr->len - len; return memcmp(attr->value, value, len); } static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = app->mad.rb_node; struct mrp_attr *attr; int d; while (parent) { attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) parent = parent->rb_left; else if (d < 0) parent = parent->rb_right; else return attr; } return NULL; } static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = NULL, **p = &app->mad.rb_node; struct mrp_attr *attr; int d; while (*p) { parent = *p; attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) p = &parent->rb_left; else if (d < 0) p = &parent->rb_right; else { /* The attribute already exists; re-use it. */ return attr; } } attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); if (!attr) return attr; attr->state = MRP_APPLICANT_VO; attr->type = type; attr->len = len; memcpy(attr->value, value, len); rb_link_node(&attr->node, parent, p); rb_insert_color(&attr->node, &app->mad); return attr; } static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) { rb_erase(&attr->node, &app->mad); kfree(attr); } static void mrp_attr_destroy_all(struct mrp_applicant *app) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_destroy(app, attr); } } static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; struct mrp_pdu_hdr *ph; skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), GFP_ATOMIC); if (!skb) return -ENOMEM; skb->dev = app->dev; skb->protocol = app->app->pkttype.type; skb_reserve(skb, LL_RESERVED_SPACE(app->dev)); skb_reset_network_header(skb); skb_reset_transport_header(skb); ph = __skb_put(skb, sizeof(*ph)); ph->version = app->app->version; app->pdu = skb; return 0; } static int mrp_pdu_append_end_mark(struct mrp_applicant *app) { __be16 *endmark; if (skb_tailroom(app->pdu) < sizeof(*endmark)) return -1; endmark = __skb_put(app->pdu, sizeof(*endmark)); put_unaligned(MRP_END_MARK, endmark); return 0; } static void mrp_pdu_queue(struct mrp_applicant *app) { if (!app->pdu) return; if (mrp_cb(app->pdu)->mh) mrp_pdu_append_end_mark(app); mrp_pdu_append_end_mark(app); dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type), app->app->group_address, app->dev->dev_addr, app->pdu->len); skb_queue_tail(&app->queue, app->pdu); app->pdu = NULL; } static void mrp_queue_xmit(struct mrp_applicant *app) { struct sk_buff *skb; while ((skb = skb_dequeue(&app->queue))) dev_queue_xmit(skb); } static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app, u8 attrtype, u8 attrlen) { struct mrp_msg_hdr *mh; if (mrp_cb(app->pdu)->mh) { if (mrp_pdu_append_end_mark(app) < 0) return -1; mrp_cb(app->pdu)->mh = NULL; mrp_cb(app->pdu)->vah = NULL; } if (skb_tailroom(app->pdu) < sizeof(*mh)) return -1; mh = __skb_put(app->pdu, sizeof(*mh)); mh->attrtype = attrtype; mh->attrlen = attrlen; mrp_cb(app->pdu)->mh = mh; return 0; } static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app, const void *firstattrvalue, u8 attrlen) { struct mrp_vecattr_hdr *vah; if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen) return -1; vah = __skb_put(app->pdu, sizeof(*vah) + attrlen); put_unaligned(0, &vah->lenflags); memcpy(vah->firstattrvalue, firstattrvalue, attrlen); mrp_cb(app->pdu)->vah = vah; memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen); return 0; } static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app, const struct mrp_attr *attr, enum mrp_vecattr_event vaevent) { u16 len, pos; u8 *vaevents; int err; again: if (!app->pdu) { err = mrp_pdu_init(app); if (err < 0) return err; } /* If there is no Message header in the PDU, or the Message header is * for a different attribute type, add an EndMark (if necessary) and a * new Message header to the PDU. */ if (!mrp_cb(app->pdu)->mh || mrp_cb(app->pdu)->mh->attrtype != attr->type || mrp_cb(app->pdu)->mh->attrlen != attr->len) { if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0) goto queue; } /* If there is no VectorAttribute header for this Message in the PDU, * or this attribute's value does not sequentially follow the previous * attribute's value, add a new VectorAttribute header to the PDU. */ if (!mrp_cb(app->pdu)->vah || memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) { if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0) goto queue; } len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags)); pos = len % 3; /* Events are packed into Vectors in the PDU, three to a byte. Add a * byte to the end of the Vector if necessary. */ if (!pos) { if (skb_tailroom(app->pdu) < sizeof(u8)) goto queue; vaevents = __skb_put(app->pdu, sizeof(u8)); } else { vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8)); } switch (pos) { case 0: *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); break; case 1: *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX; break; case 2: *vaevents += vaevent; break; default: WARN_ON(1); } /* Increment the length of the VectorAttribute in the PDU, as well as * the value of the next attribute that would continue its Vector. */ put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags); mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len); return 0; queue: mrp_pdu_queue(app); goto again; } static void mrp_attr_event(struct mrp_applicant *app, struct mrp_attr *attr, enum mrp_event event) { enum mrp_applicant_state state; state = mrp_applicant_state_table[attr->state][event]; if (state == MRP_APPLICANT_INVALID) { WARN_ON(1); return; } if (event == MRP_EVENT_TX) { /* When appending the attribute fails, don't update its state * in order to retry at the next TX event. */ switch (mrp_tx_action_table[attr->state]) { case MRP_TX_ACTION_NONE: case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL: case MRP_TX_ACTION_S_IN_OPTIONAL: break; case MRP_TX_ACTION_S_NEW: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_NEW) < 0) return; break; case MRP_TX_ACTION_S_JOIN_IN: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0) return; break; case MRP_TX_ACTION_S_LV: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_LV) < 0) return; /* As a pure applicant, sending a leave message * implies that the attribute was unregistered and * can be destroyed. */ mrp_attr_destroy(app, attr); return; default: WARN_ON(1); } } attr->state = state; } int mrp_request_join(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return -ENOMEM; spin_lock_bh(&app->lock); attr = mrp_attr_create(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return -ENOMEM; } mrp_attr_event(app, attr, MRP_EVENT_JOIN); spin_unlock_bh(&app->lock); return 0; } EXPORT_SYMBOL_GPL(mrp_request_join); void mrp_request_leave(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return; spin_lock_bh(&app->lock); attr = mrp_attr_lookup(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return; } mrp_attr_event(app, attr, MRP_EVENT_LV); spin_unlock_bh(&app->lock); } EXPORT_SYMBOL_GPL(mrp_request_leave); static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_event(app, attr, event); } } static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } static void mrp_join_timer(struct timer_list *t) { struct mrp_applicant *app = timer_container_of(app, t, join_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_pdu_queue(app); spin_unlock(&app->lock); mrp_queue_xmit(app); spin_lock(&app->lock); if (likely(app->active)) mrp_join_timer_arm(app); spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) { mod_timer(&app->periodic_timer, jiffies + msecs_to_jiffies(mrp_periodic_time)); } static void mrp_periodic_timer(struct timer_list *t) { struct mrp_applicant *app = timer_container_of(app, t, periodic_timer); spin_lock(&app->lock); if (likely(app->active)) { mrp_mad_event(app, MRP_EVENT_PERIODIC); mrp_pdu_queue(app); mrp_periodic_timer_arm(app); } spin_unlock(&app->lock); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) { __be16 endmark; if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0) return -1; if (endmark == MRP_END_MARK) { *offset += sizeof(endmark); return -1; } return 0; } static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app, struct sk_buff *skb, enum mrp_vecattr_event vaevent) { struct mrp_attr *attr; enum mrp_event event; attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen, mrp_cb(skb)->mh->attrtype); if (attr == NULL) return; switch (vaevent) { case MRP_VECATTR_EVENT_NEW: event = MRP_EVENT_R_NEW; break; case MRP_VECATTR_EVENT_JOIN_IN: event = MRP_EVENT_R_JOIN_IN; break; case MRP_VECATTR_EVENT_IN: event = MRP_EVENT_R_IN; break; case MRP_VECATTR_EVENT_JOIN_MT: event = MRP_EVENT_R_JOIN_MT; break; case MRP_VECATTR_EVENT_MT: event = MRP_EVENT_R_MT; break; case MRP_VECATTR_EVENT_LV: event = MRP_EVENT_R_LV; break; default: return; } mrp_attr_event(app, attr, event); } static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_vecattr_hdr _vah; u16 valen; u8 vaevents, vaevent; mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, sizeof(_vah), &_vah); if (!mrp_cb(skb)->vah) return -1; *offset += sizeof(_vah); if (get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_FLAG_LA) mrp_mad_event(app, MRP_EVENT_R_LA); valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_LEN_MASK); /* The VectorAttribute structure in a PDU carries event information * about one or more attributes having consecutive values. Only the * value for the first attribute is contained in the structure. So * we make a copy of that value, and then increment it each time we * advance to the next event in its Vector. */ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen > sizeof_field(struct sk_buff, cb)) return -1; if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen) < 0) return -1; *offset += mrp_cb(skb)->mh->attrlen; /* In a VectorAttribute, the Vector contains events which are packed * three to a byte. We process one byte of the Vector at a time. */ while (valen > 0) { if (skb_copy_bits(skb, *offset, &vaevents, sizeof(vaevents)) < 0) return -1; *offset += sizeof(vaevents); /* Extract and process the first event. */ vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); if (vaevent >= __MRP_VECATTR_EVENT_MAX) { /* The byte is malformed; stop processing. */ return -1; } mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the second event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); vaevent = vaevents / __MRP_VECATTR_EVENT_MAX; mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the third event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= __MRP_VECATTR_EVENT_MAX; vaevent = vaevents; mrp_pdu_parse_vecattr_event(app, skb, vaevent); } return 0; } static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_msg_hdr _mh; mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh); if (!mrp_cb(skb)->mh) return -1; *offset += sizeof(_mh); if (mrp_cb(skb)->mh->attrtype == 0 || mrp_cb(skb)->mh->attrtype > app->app->maxattr || mrp_cb(skb)->mh->attrlen == 0) return -1; while (skb->len > *offset) { if (mrp_pdu_parse_end_mark(skb, offset) < 0) break; if (mrp_pdu_parse_vecattr(app, skb, offset) < 0) return -1; } return 0; } static int mrp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct mrp_application *appl = container_of(pt, struct mrp_application, pkttype); struct mrp_port *port; struct mrp_applicant *app; struct mrp_pdu_hdr _ph; const struct mrp_pdu_hdr *ph; int offset = skb_network_offset(skb); /* If the interface is in promiscuous mode, drop the packet if * it was unicast to another host. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; port = rcu_dereference(dev->mrp_port); if (unlikely(!port)) goto out; app = rcu_dereference(port->applicants[appl->type]); if (unlikely(!app)) goto out; ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph); if (!ph) goto out; offset += sizeof(_ph); if (ph->version != app->app->version) goto out; spin_lock(&app->lock); while (skb->len > offset) { if (mrp_pdu_parse_end_mark(skb, &offset) < 0) break; if (mrp_pdu_parse_msg(app, skb, &offset) < 0) break; } spin_unlock(&app->lock); out: kfree_skb(skb); return 0; } static int mrp_init_port(struct net_device *dev) { struct mrp_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; rcu_assign_pointer(dev->mrp_port, port); return 0; } static void mrp_release_port(struct net_device *dev) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); unsigned int i; for (i = 0; i <= MRP_APPLICATION_MAX; i++) { if (rtnl_dereference(port->applicants[i])) return; } RCU_INIT_POINTER(dev->mrp_port, NULL); kfree_rcu(port, rcu); } int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_applicant *app; int err; ASSERT_RTNL(); if (!rtnl_dereference(dev->mrp_port)) { err = mrp_init_port(dev); if (err < 0) goto err1; } err = -ENOMEM; app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) goto err2; err = dev_mc_add(dev, appl->group_address); if (err < 0) goto err3; app->dev = dev; app->app = appl; app->mad = RB_ROOT; app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); timer_setup(&app->join_timer, mrp_join_timer, 0); mrp_join_timer_arm(app); timer_setup(&app->periodic_timer, mrp_periodic_timer, 0); mrp_periodic_timer_arm(app); return 0; err3: kfree(app); err2: mrp_release_port(dev); err1: return err; } EXPORT_SYMBOL_GPL(mrp_init_applicant); void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); ASSERT_RTNL(); RCU_INIT_POINTER(port->applicants[appl->type], NULL); spin_lock_bh(&app->lock); app->active = false; spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ timer_shutdown_sync(&app->join_timer); timer_shutdown_sync(&app->periodic_timer); spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock); mrp_queue_xmit(app); dev_mc_del(dev, appl->group_address); kfree_rcu(app, rcu); mrp_release_port(dev); } EXPORT_SYMBOL_GPL(mrp_uninit_applicant); int mrp_register_application(struct mrp_application *appl) { appl->pkttype.func = mrp_rcv; dev_add_pack(&appl->pkttype); return 0; } EXPORT_SYMBOL_GPL(mrp_register_application); void mrp_unregister_application(struct mrp_application *appl) { dev_remove_pack(&appl->pkttype); } EXPORT_SYMBOL_GPL(mrp_unregister_application);
1 2 2 11 19 3 1 15 2 19 19 6 13 4 13 2 10 10 4 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/netfilter/xt_IDLETIMER.c * * Netfilter module to trigger a timer when packet matches. * After timer expires a kevent will be sent. * * Copyright (C) 2004, 2010 Nokia Corporation * Written by Timo Teras <ext-timo.teras@nokia.com> * * Converted to x_tables and reworked for upstream inclusion * by Luciano Coelho <luciano.coelho@nokia.com> * * Contact: Luciano Coelho <luciano.coelho@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/timer.h> #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_IDLETIMER.h> #include <linux/kdev_t.h> #include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/sysfs.h> struct idletimer_tg { struct list_head entry; struct alarm alarm; struct timer_list timer; struct work_struct work; struct kobject *kobj; struct device_attribute attr; unsigned int refcnt; u8 timer_type; }; static LIST_HEAD(idletimer_tg_list); static DEFINE_MUTEX(list_mutex); static struct kobject *idletimer_tg_kobj; static struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; } return NULL; } static ssize_t idletimer_tg_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idletimer_tg *timer; unsigned long expires = 0; struct timespec64 ktimespec = {}; long time_diff = 0; mutex_lock(&list_mutex); timer = __idletimer_tg_find_by_label(attr->attr.name); if (timer) { if (timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm); ktimespec = ktime_to_timespec64(expires_alarm); time_diff = ktimespec.tv_sec; } else { expires = timer->timer.expires; time_diff = jiffies_to_msecs(expires - jiffies) / 1000; } } mutex_unlock(&list_mutex); if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) return sysfs_emit(buf, "%ld\n", time_diff); return sysfs_emit(buf, "0\n"); } static void idletimer_tg_work(struct work_struct *work) { struct idletimer_tg *timer = container_of(work, struct idletimer_tg, work); sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); } static void idletimer_tg_expired(struct timer_list *t) { struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; ret = xt_check_proc_name(name, size); if (ret < 0) return ret; if (!strcmp(name, "power") || !strcmp(name, "subsystem") || !strcmp(name, "uevent")) return -EINVAL; return 0; } static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } list_add(&info->timer->entry, &idletimer_tg_list); timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } /* notify userspace */ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); list_add(&info->timer->entry, &idletimer_tg_list); pr_debug("timer type value is %u", info->timer_type); info->timer->timer_type = info->timer_type; info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout; alarm_init(&info->timer->alarm, ALARM_BOOTTIME, idletimer_tg_alarmproc); info->timer->alarm.data = info->timer; tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; } static int idletimer_tg_helper(struct idletimer_tg_info *info) { if (info->timeout == 0) { pr_debug("timeout value is zero\n"); return -EINVAL; } if (info->timeout >= INT_MAX / 1000) { pr_debug("timeout value is too big\n"); return -EINVAL; } if (info->label[0] == '\0' || strnlen(info->label, MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { pr_debug("label is empty or not nul-terminated\n"); return -EINVAL; } return 0; } static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) { struct idletimer_tg_info *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); ret = idletimer_tg_helper(info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) { struct idletimer_tg_info_v1 *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); if (info->send_nl_msg) return -EOPNOTSUPP; ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } if (info->timer_type > XT_IDLETIMER_ALARM) { pr_debug("invalid value for timer type\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { if (info->timer->timer_type != info->timer_type) { pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); mutex_unlock(&list_mutex); return -EINVAL; } info->timer->refcnt++; if (info->timer_type & XT_IDLETIMER_ALARM) { /* calculate remaining expiry time */ ktime_t tout = alarm_expires_remaining(&info->timer->alarm); struct timespec64 ktimespec = ktime_to_timespec64(tout); if (ktimespec.tv_sec > 0) { pr_debug("time_expiry_remaining %lld\n", ktimespec.tv_sec); alarm_start_relative(&info->timer->alarm, tout); } } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create_v1(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); timer_shutdown_sync(&info->timer->timer); cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { alarm_cancel(&info->timer->alarm); } else { timer_shutdown_sync(&info->timer->timer); } cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { .name = "IDLETIMER", .family = NFPROTO_IPV4, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV4, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "IDLETIMER", .family = NFPROTO_IPV6, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV6, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #endif }; static struct class *idletimer_tg_class; static struct device *idletimer_tg_device; static int __init idletimer_tg_init(void) { int err; idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); goto out; } idletimer_tg_device = device_create(idletimer_tg_class, NULL, MKDEV(0, 0), NULL, "timers"); err = PTR_ERR(idletimer_tg_device); if (IS_ERR(idletimer_tg_device)) { pr_debug("couldn't register system device\n"); goto out_class; } idletimer_tg_kobj = &idletimer_tg_device->kobj; err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; } return 0; out_dev: device_destroy(idletimer_tg_class, MKDEV(0, 0)); out_class: class_destroy(idletimer_tg_class); out: return err; } static void __exit idletimer_tg_exit(void) { xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); } module_init(idletimer_tg_init); module_exit(idletimer_tg_exit); MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("ipt_IDLETIMER"); MODULE_ALIAS("ip6t_IDLETIMER");
1317 1317 1314 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 OR MIT */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include <asm/cpufeature.h> #include <asm/fpu/api.h> #include <asm/processor.h> #include <asm/simd.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/sizes.h> asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *block, const size_t nblocks, const u32 inc); asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block, const size_t nblocks, const u32 inc); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); static void blake2s_compress(struct blake2s_state *state, const u8 *block, size_t nblocks, const u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { blake2s_compress_generic(state, block, nblocks, inc); return; } do { const size_t blocks = min_t(size_t, nblocks, SZ_4K / BLAKE2S_BLOCK_SIZE); kernel_fpu_begin(); if (static_branch_likely(&blake2s_use_avx512)) blake2s_compress_avx512(state, block, blocks, inc); else blake2s_compress_ssse3(state, block, blocks, inc); kernel_fpu_end(); nblocks -= blocks; block += blocks * BLAKE2S_BLOCK_SIZE; } while (nblocks); } #define blake2s_mod_init_arch blake2s_mod_init_arch static void blake2s_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) static_branch_enable(&blake2s_use_ssse3); if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL)) static_branch_enable(&blake2s_use_avx512); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMALLOC_H #define _LINUX_VMALLOC_H #include <linux/alloc_tag.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/list.h> #include <linux/llist.h> #include <asm/page.h> /* pgprot_t */ #include <linux/rbtree.h> #include <linux/overflow.h> #include <asm/vmalloc.h> struct vm_area_struct; /* vma defining user mapping in mm_types.h */ struct notifier_block; /* in notifier.h */ struct iov_iter; /* in uio.h */ /* bits in flags of vmalloc's vm_struct below */ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ #define VM_ALLOC 0x00000002 /* vmalloc() */ #define VM_MAP 0x00000004 /* vmap()ed pages */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_ALLOW_HUGE_VMAP 0x00000400 /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) #define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ #else #define VM_DEFER_KMEMLEAK 0 #endif #define VM_SPARSE 0x00001000 /* sparse vm_area. not all pages are present. */ /* bits [20..32] reserved for arch specific ioremap internals */ /* * Maximum alignment for ioremap() regions. * Can be overridden by arch-specific value. */ #ifndef IOREMAP_MAX_ORDER #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ #endif struct vm_struct { struct vm_struct *next; void *addr; unsigned long size; unsigned long flags; struct page **pages; #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC unsigned int page_order; #endif unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; unsigned long requested_size; }; struct vmap_area { unsigned long va_start; unsigned long va_end; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ /* * The following two variables can be packed, because * a vmap_area object can be either: * 1) in "free" tree (root is free_vmap_area_root) * 2) or "busy" tree (root is vmap_area_root) */ union { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ }; unsigned long flags; /* mark type of vm_map_ram area */ }; /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ #ifndef arch_vmap_p4d_supported static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pte_range_map_size static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, u64 pfn, unsigned int max_page_shift) { return PAGE_SIZE; } #endif #ifndef arch_vmap_pte_range_unmap_size static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, pte_t *ptep) { return PAGE_SIZE; } #endif #ifndef arch_vmap_pte_supported_shift static inline int arch_vmap_pte_supported_shift(unsigned long size) { return PAGE_SHIFT; } #endif #ifndef arch_vmap_pgprot_tagged static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) { return prot; } #endif /* * Highlevel APIs for driver use */ extern void vm_unmap_ram(const void *mem, unsigned int count); extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); #define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); #define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); #define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); #define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); #define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); #define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); #define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); #define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) __alloc_size(1); #define __vmalloc_node_range(...) alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); #define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1); #define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__)) static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) { return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE); } extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) void *__must_check vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, gfp_t flags, int nid) __realloc_size(2); #define vrealloc_node_noprof(_p, _s, _f, _nid) \ vrealloc_node_align_noprof(_p, _s, 1, _f, _nid) #define vrealloc_noprof(_p, _s, _f) \ vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE) #define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__)) #define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__)) #define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot); extern void vunmap(const void *addr); extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size); extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); /* * Lowlevel-APIs (not for driver use!) */ static inline size_t get_vm_area_size(const struct vm_struct *area) { if (!(area->flags & VM_NO_GUARD)) /* return actual size without guard page */ return area->size - PAGE_SIZE; else return area->size; } extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller); void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); struct vmap_area *find_vmap_area(unsigned long addr); static inline bool is_vm_area_hugepages(const void *addr) { /* * This may not 100% tell if the area is mapped with > PAGE_SIZE * page table entries, if for some reason the architecture indicates * larger sizes are available but decides not to use them, nothing * prevents that. This only indicates the size of the physical page * allocated in the vmalloc layer. */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return find_vm_area(addr)->page_order > 0; #else return false; #endif } /* for /proc/kcore */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. */ __init void vm_area_add_early(struct vm_struct *vm); __init void vm_area_register_early(struct vm_struct *vm, size_t align); int register_vmap_purge_notifier(struct notifier_block *nb); int unregister_vmap_purge_notifier(struct notifier_block *nb); #ifdef CONFIG_MMU #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) unsigned long vmalloc_nr_pages(void); int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, unsigned long end); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } #else /* !CONFIG_MMU */ #define VMALLOC_TOTAL 0UL static inline unsigned long vmalloc_nr_pages(void) { return 0; } static inline void set_vm_flush_reset_perms(void *addr) {} #endif /* CONFIG_MMU */ #if defined(CONFIG_MMU) && defined(CONFIG_SMP) struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); # else static inline struct vm_struct ** pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { return NULL; } static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) {} #endif #if defined(CONFIG_MMU) && defined(CONFIG_PRINTK) bool vmalloc_dump_obj(void *object); #else static inline bool vmalloc_dump_obj(void *object) { return false; } #endif #endif /* _LINUX_VMALLOC_H */
984 233 986 215 10 207 205 20 20 10 14 3 3 3 3 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 // SPDX-License-Identifier: GPL-2.0-only /* * Helpers for formatting and printing strings * * Copyright 31 August 2008 James Bottomley * Copyright (C) 2013, Intel Corporation */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/math64.h> #include <linux/export.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_helpers.h> #include <kunit/test.h> #include <kunit/test-bug.h> /** * string_get_size - get the size in the specified units * @size: The size to be converted in blocks * @blk_size: Size of the block (use 1 for size in bytes) * @units: Units to use (powers of 1000 or 1024), whether to include space separator * @buf: buffer to format to * @len: length of buffer * * This function returns a string formatted to 3 significant figures * giving the size in the required units. @buf should have room for * at least 9 bytes and will always be zero terminated. * * Return value: number of characters of output that would have been written * (which may be greater than len, if output was truncated). */ int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, char *buf, int len) { enum string_size_units units_base = units & STRING_UNITS_MASK; static const char *const units_10[] = { "", "k", "M", "G", "T", "P", "E", "Z", "Y", }; static const char *const units_2[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi", }; static const char *const *const units_str[] = { [STRING_UNITS_10] = units_10, [STRING_UNITS_2] = units_2, }; static const unsigned int divisor[] = { [STRING_UNITS_10] = 1000, [STRING_UNITS_2] = 1024, }; static const unsigned int rounding[] = { 500, 50, 5 }; int i = 0, j; u32 remainder = 0, sf_cap; char tmp[12]; const char *unit; tmp[0] = '\0'; if (blk_size == 0) size = 0; if (size == 0) goto out; /* This is Napier's algorithm. Reduce the original block size to * * coefficient * divisor[units_base]^i * * we do the reduction so both coefficients are just under 32 bits so * that multiplying them together won't overflow 64 bits and we keep * as much precision as possible in the numbers. * * Note: it's safe to throw away the remainders here because all the * precision is in the coefficients. */ while (blk_size >> 32) { do_div(blk_size, divisor[units_base]); i++; } while (size >> 32) { do_div(size, divisor[units_base]); i++; } /* now perform the actual multiplication keeping i as the sum of the * two logarithms */ size *= blk_size; /* and logarithmically reduce it until it's just under the divisor */ while (size >= divisor[units_base]) { remainder = do_div(size, divisor[units_base]); i++; } /* work out in j how many digits of precision we need from the * remainder */ sf_cap = size; for (j = 0; sf_cap*10 < 1000; j++) sf_cap *= 10; if (units_base == STRING_UNITS_2) { /* express the remainder as a decimal. It's currently the * numerator of a fraction whose denominator is * divisor[units_base], which is 1 << 10 for STRING_UNITS_2 */ remainder *= 1000; remainder >>= 10; } /* add a 5 to the digit below what will be printed to ensure * an arithmetical round up and carry it through to size */ remainder += rounding[j]; if (remainder >= 1000) { remainder -= 1000; size += 1; } if (j) { snprintf(tmp, sizeof(tmp), ".%03u", remainder); tmp[j+1] = '\0'; } out: if (i >= ARRAY_SIZE(units_2)) unit = "UNK"; else unit = units_str[units_base][i]; return snprintf(buf, len, "%u%s%s%s%s", (u32)size, tmp, (units & STRING_UNITS_NO_SPACE) ? "" : " ", unit, (units & STRING_UNITS_NO_BYTES) ? "" : "B"); } EXPORT_SYMBOL(string_get_size); int parse_int_array(const char *buf, size_t count, int **array) { int *ints, nints; get_options(buf, 0, &nints); if (!nints) return -ENOENT; ints = kcalloc(nints + 1, sizeof(*ints), GFP_KERNEL); if (!ints) return -ENOMEM; get_options(buf, nints + 1, ints); *array = ints; return 0; } EXPORT_SYMBOL(parse_int_array); /** * parse_int_array_user - Split string into a sequence of integers * @from: The user space buffer to read from * @count: The maximum number of bytes to read * @array: Returned pointer to sequence of integers * * On success @array is allocated and initialized with a sequence of * integers extracted from the @from plus an additional element that * begins the sequence and specifies the integers count. * * Caller takes responsibility for freeing @array when it is no longer * needed. */ int parse_int_array_user(const char __user *from, size_t count, int **array) { char *buf; int ret; buf = memdup_user_nul(from, count); if (IS_ERR(buf)) return PTR_ERR(buf); ret = parse_int_array(buf, count, array); kfree(buf); return ret; } EXPORT_SYMBOL(parse_int_array_user); static bool unescape_space(char **src, char **dst) { char *p = *dst, *q = *src; switch (*q) { case 'n': *p = '\n'; break; case 'r': *p = '\r'; break; case 't': *p = '\t'; break; case 'v': *p = '\v'; break; case 'f': *p = '\f'; break; default: return false; } *dst += 1; *src += 1; return true; } static bool unescape_octal(char **src, char **dst) { char *p = *dst, *q = *src; u8 num; if (isodigit(*q) == 0) return false; num = (*q++) & 7; while (num < 32 && isodigit(*q) && (q - *src < 3)) { num <<= 3; num += (*q++) & 7; } *p = num; *dst += 1; *src = q; return true; } static bool unescape_hex(char **src, char **dst) { char *p = *dst, *q = *src; int digit; u8 num; if (*q++ != 'x') return false; num = digit = hex_to_bin(*q++); if (digit < 0) return false; digit = hex_to_bin(*q); if (digit >= 0) { q++; num = (num << 4) | digit; } *p = num; *dst += 1; *src = q; return true; } static bool unescape_special(char **src, char **dst) { char *p = *dst, *q = *src; switch (*q) { case '\"': *p = '\"'; break; case '\\': *p = '\\'; break; case 'a': *p = '\a'; break; case 'e': *p = '\e'; break; default: return false; } *dst += 1; *src += 1; return true; } /** * string_unescape - unquote characters in the given string * @src: source buffer (escaped) * @dst: destination buffer (unescaped) * @size: size of the destination buffer (0 to unlimit) * @flags: combination of the flags. * * Description: * The function unquotes characters in the given string. * * Because the size of the output will be the same as or less than the size of * the input, the transformation may be performed in place. * * Caller must provide valid source and destination pointers. Be aware that * destination buffer will always be NULL-terminated. Source string must be * NULL-terminated as well. The supported flags are:: * * UNESCAPE_SPACE: * '\f' - form feed * '\n' - new line * '\r' - carriage return * '\t' - horizontal tab * '\v' - vertical tab * UNESCAPE_OCTAL: * '\NNN' - byte with octal value NNN (1 to 3 digits) * UNESCAPE_HEX: * '\xHH' - byte with hexadecimal value HH (1 to 2 digits) * UNESCAPE_SPECIAL: * '\"' - double quote * '\\' - backslash * '\a' - alert (BEL) * '\e' - escape * UNESCAPE_ANY: * all previous together * * Return: * The amount of the characters processed to the destination buffer excluding * trailing '\0' is returned. */ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) { char *out = dst; if (!size) size = SIZE_MAX; while (*src && --size) { if (src[0] == '\\' && src[1] != '\0' && size > 1) { src++; size--; if (flags & UNESCAPE_SPACE && unescape_space(&src, &out)) continue; if (flags & UNESCAPE_OCTAL && unescape_octal(&src, &out)) continue; if (flags & UNESCAPE_HEX && unescape_hex(&src, &out)) continue; if (flags & UNESCAPE_SPECIAL && unescape_special(&src, &out)) continue; *out++ = '\\'; } *out++ = *src++; } *out = '\0'; return out - dst; } EXPORT_SYMBOL(string_unescape); static bool escape_passthrough(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = c; *dst = out + 1; return true; } static bool escape_space(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; switch (c) { case '\n': to = 'n'; break; case '\r': to = 'r'; break; case '\t': to = 't'; break; case '\v': to = 'v'; break; case '\f': to = 'f'; break; default: return false; } if (out < end) *out = '\\'; ++out; if (out < end) *out = to; ++out; *dst = out; return true; } static bool escape_special(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; switch (c) { case '\\': to = '\\'; break; case '\a': to = 'a'; break; case '\e': to = 'e'; break; case '"': to = '"'; break; default: return false; } if (out < end) *out = '\\'; ++out; if (out < end) *out = to; ++out; *dst = out; return true; } static bool escape_null(unsigned char c, char **dst, char *end) { char *out = *dst; if (c) return false; if (out < end) *out = '\\'; ++out; if (out < end) *out = '0'; ++out; *dst = out; return true; } static bool escape_octal(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = '\\'; ++out; if (out < end) *out = ((c >> 6) & 0x07) + '0'; ++out; if (out < end) *out = ((c >> 3) & 0x07) + '0'; ++out; if (out < end) *out = ((c >> 0) & 0x07) + '0'; ++out; *dst = out; return true; } static bool escape_hex(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = '\\'; ++out; if (out < end) *out = 'x'; ++out; if (out < end) *out = hex_asc_hi(c); ++out; if (out < end) *out = hex_asc_lo(c); ++out; *dst = out; return true; } /** * string_escape_mem - quote characters in the given memory buffer * @src: source buffer (unescaped) * @isz: source buffer size * @dst: destination buffer (escaped) * @osz: destination buffer size * @flags: combination of the flags * @only: NULL-terminated string containing characters used to limit * the selected escape class. If characters are included in @only * that would not normally be escaped by the classes selected * in @flags, they will be copied to @dst unescaped. * * Description: * The process of escaping byte buffer includes several parts. They are applied * in the following sequence. * * 1. The character is not matched to the one from @only string and thus * must go as-is to the output. * 2. The character is matched to the printable and ASCII classes, if asked, * and in case of match it passes through to the output. * 3. The character is matched to the printable or ASCII class, if asked, * and in case of match it passes through to the output. * 4. The character is checked if it falls into the class given by @flags. * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any * character. Note that they actually can't go together, otherwise * %ESCAPE_HEX will be ignored. * * Caller must provide valid source and destination pointers. Be aware that * destination buffer will not be NULL-terminated, thus caller have to append * it if needs. The supported flags are:: * * %ESCAPE_SPACE: (special white space, not space itself) * '\f' - form feed * '\n' - new line * '\r' - carriage return * '\t' - horizontal tab * '\v' - vertical tab * %ESCAPE_SPECIAL: * '\"' - double quote * '\\' - backslash * '\a' - alert (BEL) * '\e' - escape * %ESCAPE_NULL: * '\0' - null * %ESCAPE_OCTAL: * '\NNN' - byte with octal value NNN (3 digits) * %ESCAPE_ANY: * all previous together * %ESCAPE_NP: * escape only non-printable characters, checked by isprint() * %ESCAPE_ANY_NP: * all previous together * %ESCAPE_HEX: * '\xHH' - byte with hexadecimal value HH (2 digits) * %ESCAPE_NA: * escape only non-ascii characters, checked by isascii() * %ESCAPE_NAP: * escape only non-printable or non-ascii characters * %ESCAPE_APPEND: * append characters from @only to be escaped by the given classes * * %ESCAPE_APPEND would help to pass additional characters to the escaped, when * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided. * * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the * higher priority than the rest of the flags (%ESCAPE_NAP is the highest). * It doesn't make much sense to use either of them without %ESCAPE_OCTAL * or %ESCAPE_HEX, because they cover most of the other character classes. * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to * the above. * * Return: * The total size of the escaped output that would be generated for * the given input and flags. To check whether the output was * truncated, compare the return value to osz. There is room left in * dst for a '\0' terminator if and only if ret < osz. */ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *only) { char *p = dst; char *end = p + osz; bool is_dict = only && *only; bool is_append = flags & ESCAPE_APPEND; while (isz--) { unsigned char c = *src++; bool in_dict = is_dict && strchr(only, c); /* * Apply rules in the following sequence: * - the @only string is supplied and does not contain a * character under question * - the character is printable and ASCII, when @flags has * %ESCAPE_NAP bit set * - the character is printable, when @flags has * %ESCAPE_NP bit set * - the character is ASCII, when @flags has * %ESCAPE_NA bit set * - the character doesn't fall into a class of symbols * defined by given @flags * In these cases we just pass through a character to the * output buffer. * * When %ESCAPE_APPEND is passed, the characters from @only * have been excluded from the %ESCAPE_NAP, %ESCAPE_NP, and * %ESCAPE_NA cases. */ if (!(is_append || in_dict) && is_dict && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isascii(c) && isprint(c) && flags & ESCAPE_NAP && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isprint(c) && flags & ESCAPE_NP && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isascii(c) && flags & ESCAPE_NA && escape_passthrough(c, &p, end)) continue; if (flags & ESCAPE_SPACE && escape_space(c, &p, end)) continue; if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end)) continue; if (flags & ESCAPE_NULL && escape_null(c, &p, end)) continue; /* ESCAPE_OCTAL and ESCAPE_HEX always go last */ if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end)) continue; if (flags & ESCAPE_HEX && escape_hex(c, &p, end)) continue; escape_passthrough(c, &p, end); } return p - dst; } EXPORT_SYMBOL(string_escape_mem); /* * Return an allocated string that has been escaped of special characters * and double quotes, making it safe to log in quotes. */ char *kstrdup_quotable(const char *src, gfp_t gfp) { size_t slen, dlen; char *dst; const int flags = ESCAPE_HEX; const char esc[] = "\f\n\r\t\v\a\e\\\""; if (!src) return NULL; slen = strlen(src); dlen = string_escape_mem(src, slen, NULL, 0, flags, esc); dst = kmalloc(dlen + 1, gfp); if (!dst) return NULL; WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen); dst[dlen] = '\0'; return dst; } EXPORT_SYMBOL_GPL(kstrdup_quotable); /* * Returns allocated NULL-terminated string containing process * command line, with inter-argument NULLs replaced with spaces, * and other special characters escaped. */ char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) { char *buffer, *quoted; int i, res; buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buffer) return NULL; res = get_cmdline(task, buffer, PAGE_SIZE - 1); buffer[res] = '\0'; /* Collapse trailing NULLs, leave res pointing to last non-NULL. */ while (--res >= 0 && buffer[res] == '\0') ; /* Replace inter-argument NULLs. */ for (i = 0; i <= res; i++) if (buffer[i] == '\0') buffer[i] = ' '; /* Make sure result is printable. */ quoted = kstrdup_quotable(buffer, gfp); kfree(buffer); return quoted; } EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline); /* * Returns allocated NULL-terminated string containing pathname, * with special characters escaped, able to be safely logged. If * there is an error, the leading character will be "<". */ char *kstrdup_quotable_file(struct file *file, gfp_t gfp) { char *temp, *pathname; if (!file) return kstrdup("<unknown>", gfp); /* We add 11 spaces for ' (deleted)' to be appended */ temp = kmalloc(PATH_MAX + 11, GFP_KERNEL); if (!temp) return kstrdup("<no_memory>", gfp); pathname = file_path(file, temp, PATH_MAX + 11); if (IS_ERR(pathname)) pathname = kstrdup("<too_long>", gfp); else pathname = kstrdup_quotable(pathname, gfp); kfree(temp); return pathname; } EXPORT_SYMBOL_GPL(kstrdup_quotable_file); /* * Returns duplicate string in which the @old characters are replaced by @new. */ char *kstrdup_and_replace(const char *src, char old, char new, gfp_t gfp) { char *dst; dst = kstrdup(src, gfp); if (!dst) return NULL; return strreplace(dst, old, new); } EXPORT_SYMBOL_GPL(kstrdup_and_replace); /** * kasprintf_strarray - allocate and fill array of sequential strings * @gfp: flags for the slab allocator * @prefix: prefix to be used * @n: amount of lines to be allocated and filled * * Allocates and fills @n strings using pattern "%s-%zu", where prefix * is provided by caller. The caller is responsible to free them with * kfree_strarray() after use. * * Returns array of strings or NULL when memory can't be allocated. */ char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n) { char **names; size_t i; names = kcalloc(n + 1, sizeof(char *), gfp); if (!names) return NULL; for (i = 0; i < n; i++) { names[i] = kasprintf(gfp, "%s-%zu", prefix, i); if (!names[i]) { kfree_strarray(names, i); return NULL; } } return names; } EXPORT_SYMBOL_GPL(kasprintf_strarray); /** * kfree_strarray - free a number of dynamically allocated strings contained * in an array and the array itself * * @array: Dynamically allocated array of strings to free. * @n: Number of strings (starting from the beginning of the array) to free. * * Passing a non-NULL @array and @n == 0 as well as NULL @array are valid * use-cases. If @array is NULL, the function does nothing. */ void kfree_strarray(char **array, size_t n) { unsigned int i; if (!array) return; for (i = 0; i < n; i++) kfree(array[i]); kfree(array); } EXPORT_SYMBOL_GPL(kfree_strarray); struct strarray { char **array; size_t n; }; static void devm_kfree_strarray(struct device *dev, void *res) { struct strarray *array = res; kfree_strarray(array->array, array->n); } char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n) { struct strarray *ptr; ptr = devres_alloc(devm_kfree_strarray, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); ptr->array = kasprintf_strarray(GFP_KERNEL, prefix, n); if (!ptr->array) { devres_free(ptr); return ERR_PTR(-ENOMEM); } ptr->n = n; devres_add(dev, ptr); return ptr->array; } EXPORT_SYMBOL_GPL(devm_kasprintf_strarray); /** * skip_spaces - Removes leading whitespace from @str. * @str: The string to be stripped. * * Returns a pointer to the first non-whitespace character in @str. */ char *skip_spaces(const char *str) { while (isspace(*str)) ++str; return (char *)str; } EXPORT_SYMBOL(skip_spaces); /** * strim - Removes leading and trailing whitespace from @s. * @s: The string to be stripped. * * Note that the first trailing whitespace is replaced with a %NUL-terminator * in the given string @s. Returns a pointer to the first non-whitespace * character in @s. */ char *strim(char *s) { size_t size; char *end; size = strlen(s); if (!size) return s; end = s + size - 1; while (end >= s && isspace(*end)) end--; *(end + 1) = '\0'; return skip_spaces(s); } EXPORT_SYMBOL(strim); /** * sysfs_streq - return true if strings are equal, modulo trailing newline * @s1: one string * @s2: another string * * This routine returns true iff two strings are equal, treating both * NUL and newline-then-NUL as equivalent string terminations. It's * geared for use with sysfs input strings, which generally terminate * with newlines but are compared against values without newlines. */ bool sysfs_streq(const char *s1, const char *s2) { while (*s1 && *s1 == *s2) { s1++; s2++; } if (*s1 == *s2) return true; if (!*s1 && *s2 == '\n' && !s2[1]) return true; if (*s1 == '\n' && !s1[1] && !*s2) return true; return false; } EXPORT_SYMBOL(sysfs_streq); /** * match_string - matches given string in an array * @array: array of strings * @n: number of strings in the array or -1 for NULL terminated arrays * @string: string to match with * * This routine will look for a string in an array of strings up to the * n-th element in the array or until the first NULL element. * * Historically the value of -1 for @n, was used to search in arrays that * are NULL terminated. However, the function does not make a distinction * when finishing the search: either @n elements have been compared OR * the first NULL element was found. * * Return: * index of a @string in the @array if matches, or %-EINVAL otherwise. */ int match_string(const char * const *array, size_t n, const char *string) { int index; const char *item; for (index = 0; index < n; index++) { item = array[index]; if (!item) break; if (!strcmp(item, string)) return index; } return -EINVAL; } EXPORT_SYMBOL(match_string); /** * __sysfs_match_string - matches given string in an array * @array: array of strings * @n: number of strings in the array or -1 for NULL terminated arrays * @str: string to match with * * Returns index of @str in the @array or -EINVAL, just like match_string(). * Uses sysfs_streq instead of strcmp for matching. * * This routine will look for a string in an array of strings up to the * n-th element in the array or until the first NULL element. * * Historically the value of -1 for @n, was used to search in arrays that * are NULL terminated. However, the function does not make a distinction * when finishing the search: either @n elements have been compared OR * the first NULL element was found. */ int __sysfs_match_string(const char * const *array, size_t n, const char *str) { const char *item; int index; for (index = 0; index < n; index++) { item = array[index]; if (!item) break; if (sysfs_streq(item, str)) return index; } return -EINVAL; } EXPORT_SYMBOL(__sysfs_match_string); /** * strreplace - Replace all occurrences of character in string. * @str: The string to operate on. * @old: The character being replaced. * @new: The character @old is replaced with. * * Replaces the each @old character with a @new one in the given string @str. * * Return: pointer to the string @str itself. */ char *strreplace(char *str, char old, char new) { char *s = str; for (; *s; ++s) if (*s == old) *s = new; return str; } EXPORT_SYMBOL(strreplace); /** * memcpy_and_pad - Copy one buffer to another with padding * @dest: Where to copy to * @dest_len: The destination buffer size * @src: Where to copy from * @count: The number of bytes to copy * @pad: Character to use for padding if space is left in destination. */ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad) { if (dest_len > count) { memcpy(dest, src, count); memset(dest + count, pad, dest_len - count); } else { memcpy(dest, src, dest_len); } } EXPORT_SYMBOL(memcpy_and_pad); #ifdef CONFIG_FORTIFY_SOURCE /* These are placeholders for fortify compile-time warnings. */ void __read_overflow2_field(size_t avail, size_t wanted) { } EXPORT_SYMBOL(__read_overflow2_field); void __write_overflow_field(size_t avail, size_t wanted) { } EXPORT_SYMBOL(__write_overflow_field); static const char * const fortify_func_name[] = { #define MAKE_FORTIFY_FUNC_NAME(func) [MAKE_FORTIFY_FUNC(func)] = #func EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC_NAME) #undef MAKE_FORTIFY_FUNC_NAME }; void __fortify_report(const u8 reason, const size_t avail, const size_t size) { const u8 func = FORTIFY_REASON_FUNC(reason); const bool write = FORTIFY_REASON_DIR(reason); const char *name; name = fortify_func_name[umin(func, FORTIFY_FUNC_UNKNOWN)]; WARN(1, "%s: detected buffer overflow: %zu byte %s of buffer size %zu\n", name, size, str_read_write(!write), avail); } EXPORT_SYMBOL(__fortify_report); void __fortify_panic(const u8 reason, const size_t avail, const size_t size) { __fortify_report(reason, avail, size); BUG(); } EXPORT_SYMBOL(__fortify_panic); #endif /* CONFIG_FORTIFY_SOURCE */
9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 // SPDX-License-Identifier: GPL-2.0-or-later /* * PTP virtual clock driver * * Copyright 2021 NXP */ #include <linux/slab.h> #include <linux/hashtable.h> #include "ptp_private.h" #define PTP_VCLOCK_CC_SHIFT 31 #define PTP_VCLOCK_CC_MULT (1 << PTP_VCLOCK_CC_SHIFT) #define PTP_VCLOCK_FADJ_SHIFT 9 #define PTP_VCLOCK_FADJ_DENOMINATOR 15625ULL #define PTP_VCLOCK_REFRESH_INTERVAL (HZ * 2) /* protects vclock_hash addition/deletion */ static DEFINE_SPINLOCK(vclock_hash_lock); static DEFINE_READ_MOSTLY_HASHTABLE(vclock_hash, 8); static void ptp_vclock_hash_add(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_add_head_rcu(&vclock->vclock_hash_node, &vclock_hash[vclock->clock->index % HASH_SIZE(vclock_hash)]); spin_unlock(&vclock_hash_lock); } static void ptp_vclock_hash_del(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_del_init_rcu(&vclock->vclock_hash_node); spin_unlock(&vclock_hash_lock); synchronize_rcu(); } static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) { struct ptp_vclock *vclock = info_to_vclock(ptp); s64 adj; adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT; adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_read(&vclock->tc); vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj; mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct ptp_vclock *vclock = info_to_vclock(ptp); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_adjtime(&vclock->tc, delta); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_read(&vclock->tc); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static int ptp_vclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct ptp_system_timestamp *sts) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; struct timespec64 pts; int err; u64 ns; err = pptp->info->getcyclesx64(pptp->info, &pts, sts); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, timespec64_to_ns(&pts)); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static int ptp_vclock_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns = timespec64_to_ns(ts); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_init(&vclock->tc, &vclock->cc, ns); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *xtstamp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; int err; u64 ns; err = pptp->info->getcrosscycles(pptp->info, xtstamp); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, ktime_to_ns(xtstamp->device)); mutex_unlock(&vclock->lock); xtstamp->device = ns_to_ktime(ns); return 0; } static long ptp_vclock_refresh(struct ptp_clock_info *ptp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct timespec64 ts; ptp_vclock_gettime(&vclock->info, &ts); return PTP_VCLOCK_REFRESH_INTERVAL; } static void ptp_vclock_set_subclass(struct ptp_clock *ptp) { lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); } static const struct ptp_clock_info ptp_vclock_info = { .owner = THIS_MODULE, .name = "ptp virtual clock", .max_adj = 500000000, .adjfine = ptp_vclock_adjfine, .adjtime = ptp_vclock_adjtime, .settime64 = ptp_vclock_settime, .do_aux_work = ptp_vclock_refresh, }; static u64 ptp_vclock_read(struct cyclecounter *cc) { struct ptp_vclock *vclock = cc_to_vclock(cc); struct ptp_clock *ptp = vclock->pclock; struct timespec64 ts = {}; ptp->info->getcycles64(ptp->info, &ts); return timespec64_to_ns(&ts); } static const struct cyclecounter ptp_vclock_cc = { .read = ptp_vclock_read, .mask = CYCLECOUNTER_MASK(32), .mult = PTP_VCLOCK_CC_MULT, .shift = PTP_VCLOCK_CC_SHIFT, }; struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) { struct ptp_vclock *vclock; vclock = kzalloc(sizeof(*vclock), GFP_KERNEL); if (!vclock) return NULL; vclock->pclock = pclock; vclock->info = ptp_vclock_info; if (pclock->info->getcyclesx64) vclock->info.gettimex64 = ptp_vclock_gettimex; else vclock->info.gettime64 = ptp_vclock_gettime; if (pclock->info->getcrosscycles) vclock->info.getcrosststamp = ptp_vclock_getcrosststamp; vclock->cc = ptp_vclock_cc; snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt", pclock->index); INIT_HLIST_NODE(&vclock->vclock_hash_node); mutex_init(&vclock->lock); vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev); if (IS_ERR_OR_NULL(vclock->clock)) { kfree(vclock); return NULL; } ptp_vclock_set_subclass(vclock->clock); timecounter_init(&vclock->tc, &vclock->cc, 0); ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); ptp_vclock_hash_add(vclock); return vclock; } void ptp_vclock_unregister(struct ptp_vclock *vclock) { ptp_vclock_hash_del(vclock); ptp_clock_unregister(vclock->clock); kfree(vclock); } #if IS_BUILTIN(CONFIG_PTP_1588_CLOCK) int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { char name[PTP_CLOCK_NAME_LEN] = ""; struct ptp_clock *ptp; struct device *dev; int num = 0; if (pclock_index < 0) return num; snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index); dev = class_find_device_by_name(&ptp_class, name); if (!dev) return num; ptp = dev_get_drvdata(dev); if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) { put_device(dev); return num; } *vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL); if (!(*vclock_index)) goto out; memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks); num = ptp->n_vclocks; out: mutex_unlock(&ptp->n_vclocks_mux); put_device(dev); return num; } EXPORT_SYMBOL(ptp_get_vclocks_index); ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) { unsigned int hash = vclock_index % HASH_SIZE(vclock_hash); struct ptp_vclock *vclock; u64 ns; u64 vclock_ns = 0; ns = ktime_to_ns(*hwtstamp); rcu_read_lock(); hlist_for_each_entry_rcu(vclock, &vclock_hash[hash], vclock_hash_node) { if (vclock->clock->index != vclock_index) continue; if (mutex_lock_interruptible(&vclock->lock)) break; vclock_ns = timecounter_cyc2time(&vclock->tc, ns); mutex_unlock(&vclock->lock); break; } rcu_read_unlock(); return ns_to_ktime(vclock_ns); } EXPORT_SYMBOL(ptp_convert_timestamp); #endif
4 4 4 6 5 2 5 2 7 7 6 1 7 6 3 3 7 7 3 1 3 3 3 3 1 1 1 3 3 1 1 1 1 3 3 3 3 3 2 2 2 2 2 2 156 156 7 7 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 // SPDX-License-Identifier: GPL-2.0 #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/rcupdate.h> #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" struct ila_xlat_params { struct ila_params ip; int ifindex; }; struct ila_map { struct ila_xlat_params xp; struct rhash_head node; struct ila_map __rcu *next; struct rcu_head rcu; }; #define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, MAX_LOCKS, LOCKS_PER_CPU, GFP_KERNEL); } static u32 hashrnd __read_mostly; static __always_inline void __ila_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static inline u32 ila_locator_hash(struct ila_locator loc) { u32 *v = (u32 *)loc.v32; __ila_hash_secret_init(); return jhash_2words(v[0], v[1], hashrnd); } static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, struct ila_addr *iaddr, int ifindex) { return (ila->xp.ifindex && ila->xp.ifindex != ifindex); } static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *xp) { return (ila->xp.ifindex != xp->ifindex); } static int ila_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct ila_map *ila = obj; return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key); } static inline int ila_order(struct ila_map *ila) { int score = 0; if (ila->xp.ifindex) score += 1 << 1; return score; } static const struct rhashtable_params rht_params = { .nelem_hint = 1024, .head_offset = offsetof(struct ila_map, node), .key_offset = offsetof(struct ila_map, xp.ip.locator_match), .key_len = sizeof(u64), /* identifier */ .max_size = 1048576, .min_size = 256, .automatic_shrinking = true, .obj_cmpfn = ila_cmpfn, }; static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { memset(xp, 0, sizeof(*xp)); if (info->attrs[ILA_ATTR_LOCATOR]) xp->ip.locator.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR]); if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE], ILA_CSUM_NO_ACTION); xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE], ILA_ATYPE_USE_FORMAT); if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); return 0; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, int ifindex, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { if (!ila_cmp_params(ila, xp)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } static inline void ila_release(struct ila_map *ila) { kfree_rcu(ila, rcu); } static void ila_free_node(struct ila_map *ila) { struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { next = rcu_access_pointer(ila->next); ila_release(ila); ila = next; } } static void ila_free_cb(void *ptr, void *arg) { ila_free_node((struct ila_map *)ptr); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { ila_xlat_addr(skb, false); return NF_ACCEPT; } static const struct nf_hook_ops ila_nf_hook_ops[] = { { .hook = ila_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = -1, }, }; static DEFINE_MUTEX(ila_mutex); static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ mutex_lock(&ila_mutex); if (!ilan->xlat.hooks_registered) { err = nf_register_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); if (!err) WRITE_ONCE(ilan->xlat.hooks_registered, true); } mutex_unlock(&ila_mutex); if (err) return err; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); if (!ila) return -ENOMEM; ila_init_saved_csum(&xp->ip); ila->xp = *xp; order = ila_order(ila); spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; do { if (!ila_cmp_params(tila, xp)) { err = -EEXIST; goto out; } if (order > ila_order(tila)) break; prev = tila; tila = rcu_dereference_protected(tila->next, lockdep_is_held(lock)); } while (tila); if (prev) { /* Insert in sub list of head */ RCU_INIT_POINTER(ila->next, tila); rcu_assign_pointer(prev->next, ila); } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) goto out; } } out: spin_unlock(lock); if (err) kfree(ila); return err; } static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head, *prev; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = -ENOENT; spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; prev = NULL; while (ila) { if (ila_cmp_params(ila, xp)) { prev = ila; ila = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); continue; } err = 0; if (prev) { /* Not head, just delete from list */ rcu_assign_pointer(prev->next, ila->next); } else { /* It is the head. If there is something in the * sublist we need to make a new head. */ head = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); if (head) { /* Put first entry in the sublist into the * table */ err = rhashtable_replace_fast( &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ err = rhashtable_remove_fast( &ilan->xlat.rhash_table, &ila->node, rht_params); } } ila_release(ila); break; } out: spin_unlock(lock); return err; } int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params p; int err; err = parse_nl_config(info, &p); if (err) return err; return ila_add_mapping(net, &p); } int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; int err; err = parse_nl_config(info, &xp); if (err) return err; ila_del_mapping(net, &xp); return 0; } static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, struct ila_map *ila) { return ila_get_lock(ilan, ila->xp.ip.locator_match); } int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct rhashtable_iter iter; struct ila_map *ila; spinlock_t *lock; int ret = 0; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter); rhashtable_walk_start(&iter); for (;;) { ila = rhashtable_walk_next(&iter); if (IS_ERR(ila)) { if (PTR_ERR(ila) == -EAGAIN) continue; ret = PTR_ERR(ila); goto done; } else if (!ila) { break; } lock = lock_from_ila_map(ilan, ila); spin_lock(lock); ret = rhashtable_remove_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); if (!ret) ila_free_node(ila); spin_unlock(lock); if (ret) break; } done: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); return ret; } static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, (__force u64)ila->xp.ip.locator.v64, ILA_ATTR_PAD) || nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH, (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; } static int ila_dump_info(struct ila_map *ila, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (ila_fill_info(ila, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct sk_buff *msg; struct ila_xlat_params xp; struct ila_map *ila; int ret; ret = parse_nl_config(info, &xp); if (ret) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); ret = -ESRCH; ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); } rcu_read_unlock(); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } struct ila_dump_iter { struct rhashtable_iter rhiter; int skip; }; int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_dump_iter *iter; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter); iter->skip = 0; cb->args[0] = (long)iter; return 0; } int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; rhashtable_walk_exit(&iter->rhiter); kfree(iter); return 0; } int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); /* Get first entry */ ila = rhashtable_walk_peek(rhiter); if (ila && !IS_ERR(ila) && skip) { /* Skip over visited entries */ while (ila && skip) { /* Skip over any ila entries in this list that we * have already dumped. */ ila = rcu_access_pointer(ila->next); skip--; } } skip = 0; for (;;) { if (IS_ERR(ila)) { ret = PTR_ERR(ila); if (ret == -EAGAIN) { /* Table has changed and iter has reset. Return * -EAGAIN to the application even if we have * written data to the skb. The application * needs to deal with this. */ goto out_ret; } else { break; } } else if (!ila) { ret = 0; break; } while (ila) { ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) goto out; skip++; ila = rcu_access_pointer(ila->next); } skip = 0; ila = rhashtable_walk_next(rhiter); } out: iter->skip = skip; ret = (skb->len ? : ret); out_ret: rhashtable_walk_stop(rhiter); return ret; } int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); int err; err = alloc_ila_locks(ilan); if (err) return err; err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params); if (err) { free_bucket_spinlocks(ilan->xlat.locks); return err; } return 0; } void ila_xlat_pre_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); /* Assumes skb contains a valid IPv6 header that is pulled */ /* No check here that ILA type in the mapping matches what is in the * address. We assume that whatever sender gaves us can be translated. * The checksum mode however is relevant. */ rcu_read_lock(); ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); return 0; }
5 5 5 5 5 12 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 // SPDX-License-Identifier: GPL-2.0 /* * Tty buffer allocation management */ #include <linux/types.h> #include <linux/errno.h> #include <linux/minmax.h> #include <linux/tty.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ratelimit.h> #include "tty.h" #define MIN_TTYB_SIZE 256 #define TTYB_ALIGN_MASK 0xff /* * Byte threshold to limit memory consumption for flip buffers. * The actual memory limit is > 2x this amount. */ #define TTYB_DEFAULT_MEM_LIMIT (640 * 1024UL) /* * We default to dicing tty buffer allocations to this many characters * in order to avoid multiple page allocations. We know the size of * tty_buffer itself but it must also be taken into account that the * buffer is 256 byte aligned. See tty_buffer_find for the allocation * logic this must match. */ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~TTYB_ALIGN_MASK) /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * @port: tty port owning the flip buffer * * Guarantees safe use of the &tty_ldisc_ops.receive_buf() method by excluding * the buffer work and any pending flush from using the flip buffer. Data can * continue to be added concurrently to the flip buffer from the driver side. * * See also tty_buffer_unlock_exclusive(). */ void tty_buffer_lock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; atomic_inc(&buf->priority); mutex_lock(&buf->lock); } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer * * The buffer work is restarted if there is data in the flip buffer. * * See also tty_buffer_lock_exclusive(). */ void tty_buffer_unlock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; bool restart = buf->head->commit != buf->head->read; atomic_dec(&buf->priority); mutex_unlock(&buf->lock); if (restart) queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); /** * tty_buffer_space_avail - return unused buffer space * @port: tty port owning the flip buffer * * Returns: the # of bytes which can be written by the driver without reaching * the buffer limit. * * Note: this does not guarantee that memory is available to write the returned * # of bytes (use tty_prepare_flip_string() to pre-allocate if memory * guarantee is required). */ unsigned int tty_buffer_space_avail(struct tty_port *port) { int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used); return max(space, 0); } EXPORT_SYMBOL_GPL(tty_buffer_space_avail); static void tty_buffer_reset(struct tty_buffer *p, size_t size) { p->used = 0; p->size = size; p->next = NULL; p->commit = 0; p->lookahead = 0; p->read = 0; p->flags = true; } /** * tty_buffer_free_all - free buffers used by a tty * @port: tty port to free from * * Remove all the buffers pending on a tty whether queued with data or in the * free ring. Must be called when the tty is no longer in use. */ void tty_buffer_free_all(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *p, *next; struct llist_node *llist; unsigned int freed = 0; int still_used; while ((p = buf->head) != NULL) { buf->head = p->next; freed += p->size; if (p->size > 0) kfree(p); } llist = llist_del_all(&buf->free); llist_for_each_entry_safe(p, next, llist, free) kfree(p); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; still_used = atomic_xchg(&buf->mem_used, 0); WARN(still_used != freed, "we still have not freed %d bytes!", still_used - freed); } /** * tty_buffer_alloc - allocate a tty buffer * @port: tty port * @size: desired size (characters) * * Allocate a new tty buffer to hold the desired number of characters. We * round our buffers off in 256 character chunks to get better allocation * behaviour. * * Returns: %NULL if out of memory or the allocation would exceed the per * device queue. */ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) { struct llist_node *free; struct tty_buffer *p; /* Round the buffer size out */ size = __ALIGN_MASK(size, TTYB_ALIGN_MASK); if (size <= MIN_TTYB_SIZE) { free = llist_del_first(&port->buf.free); if (free) { p = llist_entry(free, struct tty_buffer, free); goto found; } } /* Should possibly check if this fails for the largest buffer we * have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; p = kmalloc(struct_size(p, data, 2 * size), GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; found: tty_buffer_reset(p, size); atomic_add(size, &port->buf.mem_used); return p; } /** * tty_buffer_free - free a tty buffer * @port: tty port owning the buffer * @b: the buffer to free * * Free a tty buffer, or add it to the free list according to our internal * strategy. */ static void tty_buffer_free(struct tty_port *port, struct tty_buffer *b) { struct tty_bufhead *buf = &port->buf; /* Dumb strategy for now - should keep some stats */ WARN_ON(atomic_sub_return(b->size, &buf->mem_used) < 0); if (b->size > MIN_TTYB_SIZE) kfree(b); else if (b->size > 0) llist_add(&b->free, &buf->free); } /** * tty_buffer_flush - flush full tty buffers * @tty: tty to flush * @ld: optional ldisc ptr (must be referenced) * * Flush all the buffers containing receive data. If @ld != %NULL, flush the * ldisc input buffer. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) { struct tty_port *port = tty->port; struct tty_bufhead *buf = &port->buf; struct tty_buffer *next; atomic_inc(&buf->priority); mutex_lock(&buf->lock); /* paired w/ release in __tty_buffer_request_room; ensures there are * no pending memory accesses to the freed buffer */ while ((next = smp_load_acquire(&buf->head->next)) != NULL) { tty_buffer_free(port, buf->head); buf->head = next; } buf->head->read = buf->head->commit; buf->head->lookahead = buf->head->read; if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } /** * __tty_buffer_request_room - grow tty buffer if needed * @port: tty port * @size: size desired * @flags: buffer has to store flags along character data * * Make at least @size bytes of linear space available for the tty buffer. * * Will change over to a new buffer if the current buffer is encoded as * %TTY_NORMAL (so has no flags buffer) and the new buffer requires a flags * buffer. * * Returns: the size we managed to find. */ static int __tty_buffer_request_room(struct tty_port *port, size_t size, bool flags) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *n, *b = buf->tail; size_t left = (b->flags ? 1 : 2) * b->size - b->used; bool change = !b->flags && flags; if (!change && left >= size) return size; /* This is the slow path - looking for new buffers to use */ n = tty_buffer_alloc(port, size); if (n == NULL) return change ? 0 : left; n->flags = flags; buf->tail = n; /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures they see all buffer data. */ smp_store_release(&b->commit, b->used); /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures the latest commit value can be read before the head * is advanced to the next buffer. */ smp_store_release(&b->next, n); return size; } int tty_buffer_request_room(struct tty_port *port, size_t size) { return __tty_buffer_request_room(port, size, true); } EXPORT_SYMBOL_GPL(tty_buffer_request_room); size_t __tty_insert_flip_string_flags(struct tty_port *port, const u8 *chars, const u8 *flags, bool mutable_flags, size_t size) { bool need_flags = mutable_flags || flags[0] != TTY_NORMAL; size_t copied = 0; do { size_t goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE); size_t space = __tty_buffer_request_room(port, goal, need_flags); struct tty_buffer *tb = port->buf.tail; if (unlikely(space == 0)) break; memcpy(char_buf_ptr(tb, tb->used), chars, space); if (mutable_flags) { memcpy(flag_buf_ptr(tb, tb->used), flags, space); flags += space; } else if (tb->flags) { memset(flag_buf_ptr(tb, tb->used), flags[0], space); } else { /* tb->flags should be available once requested */ WARN_ON_ONCE(need_flags); } tb->used += space; copied += space; chars += space; /* There is a small chance that we need to split the data over * several buffers. If this is the case we must loop. */ } while (unlikely(size > copied)); return copied; } EXPORT_SYMBOL(__tty_insert_flip_string_flags); /** * tty_prepare_flip_string - make room for characters * @port: tty port * @chars: return pointer for character write area * @size: desired size * * Prepare a block of space in the buffer for data. * * This is used for drivers that need their own block copy routines into the * buffer. There is no guarantee the buffer is a DMA target! * * Returns: the length available and buffer pointer (@chars) to the space which * is now allocated and accounted for as ready for normal characters. */ size_t tty_prepare_flip_string(struct tty_port *port, u8 **chars, size_t size) { size_t space = __tty_buffer_request_room(port, size, false); if (likely(space)) { struct tty_buffer *tb = port->buf.tail; *chars = char_buf_ptr(tb, tb->used); if (tb->flags) memset(flag_buf_ptr(tb, tb->used), TTY_NORMAL, space); tb->used += space; } return space; } EXPORT_SYMBOL_GPL(tty_prepare_flip_string); /** * tty_ldisc_receive_buf - forward data to line discipline * @ld: line discipline to process input * @p: char buffer * @f: %TTY_NORMAL, %TTY_BREAK, etc. flags buffer * @count: number of bytes to process * * Callers other than flush_to_ldisc() need to exclude the kworker from * concurrent use of the line discipline, see paste_selection(). * * Returns: the number of bytes processed. */ size_t tty_ldisc_receive_buf(struct tty_ldisc *ld, const u8 *p, const u8 *f, size_t count) { if (ld->ops->receive_buf2) count = ld->ops->receive_buf2(ld->tty, p, f, count); else { count = min_t(size_t, count, ld->tty->receive_room); if (count && ld->ops->receive_buf) ld->ops->receive_buf(ld->tty, p, f, count); } return count; } EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf); static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head) { head->lookahead = max(head->lookahead, head->read); while (head) { struct tty_buffer *next; unsigned int count; /* * Paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer. */ next = smp_load_acquire(&head->next); /* * Paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data. */ count = smp_load_acquire(&head->commit) - head->lookahead; if (!count) { head = next; continue; } if (port->client_ops->lookahead_buf) { u8 *p, *f = NULL; p = char_buf_ptr(head, head->lookahead); if (head->flags) f = flag_buf_ptr(head, head->lookahead); port->client_ops->lookahead_buf(port, p, f, count); } head->lookahead += count; } } static size_t receive_buf(struct tty_port *port, struct tty_buffer *head, size_t count) { u8 *p = char_buf_ptr(head, head->read); const u8 *f = NULL; size_t n; if (head->flags) f = flag_buf_ptr(head, head->read); n = port->client_ops->receive_buf(port, p, f, count); if (n > 0) memset(p, 0, n); return n; } /** * flush_to_ldisc - flush data from buffer to ldisc * @work: tty structure passed from work queue. * * This routine is called out of the software interrupt to flush data from the * buffer chain to the line discipline. * * The receive_buf() method is single threaded for each tty instance. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ static void flush_to_ldisc(struct work_struct *work) { struct tty_port *port = container_of(work, struct tty_port, buf.work); struct tty_bufhead *buf = &port->buf; mutex_lock(&buf->lock); while (1) { struct tty_buffer *head = buf->head; struct tty_buffer *next; size_t count, rcvd; /* Ldisc or user is trying to gain exclusive access */ if (atomic_read(&buf->priority)) break; /* paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer */ next = smp_load_acquire(&head->next); /* paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { if (next == NULL) break; buf->head = next; tty_buffer_free(port, head); continue; } rcvd = receive_buf(port, head, count); head->read += rcvd; if (rcvd < count) lookahead_bufs(port, head); if (!rcvd) break; cond_resched(); } mutex_unlock(&buf->lock); } static inline void tty_flip_buffer_commit(struct tty_buffer *tail) { /* * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees * buffer data. */ smp_store_release(&tail->commit, tail->used); } /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push * * Queue a push of the terminal flip buffers to the line discipline. Can be * called from IRQ/atomic context. * * In the event of the queue being busy for flipping the work will be held off * and retried later. */ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL(tty_flip_buffer_push); /** * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and * push * @port: tty port * @chars: characters * @size: size * * The function combines tty_insert_flip_string() and tty_flip_buffer_push() * with the exception of properly holding the @port->lock. * * To be used only internally (by pty currently). * * Returns: the number added. */ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, const u8 *chars, size_t size) { struct tty_bufhead *buf = &port->buf; unsigned long flags; spin_lock_irqsave(&port->lock, flags); size = tty_insert_flip_string(port, chars, size); if (size) tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); queue_work(system_unbound_wq, &buf->work); return size; } /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise * * Set up the initial state of the buffer management for a tty device. Must be * called before the other tty buffer functions are used. */ void tty_buffer_init(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; mutex_init(&buf->lock); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; init_llist_head(&buf->free); atomic_set(&buf->mem_used, 0); atomic_set(&buf->priority, 0); INIT_WORK(&buf->work, flush_to_ldisc); buf->mem_limit = TTYB_DEFAULT_MEM_LIMIT; } /** * tty_buffer_set_limit - change the tty buffer memory limit * @port: tty port to change * @limit: memory limit to set * * Change the tty buffer memory limit. * * Must be called before the other tty buffer functions are used. */ int tty_buffer_set_limit(struct tty_port *port, int limit) { if (limit < MIN_TTYB_SIZE) return -EINVAL; port->buf.mem_limit = limit; return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); /* slave ptys can claim nested buffer lock when handling BRK and INTR */ void tty_buffer_set_lock_subclass(struct tty_port *port) { lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); } bool tty_buffer_restart_work(struct tty_port *port) { return queue_work(system_unbound_wq, &port->buf.work); } bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } void tty_buffer_flush_work(struct tty_port *port) { flush_work(&port->buf.work); }
7 7 16 8 8 8 8 8 19 19 22 29 7 9 149 149 147 149 148 9 8 1 7 7 7 7 9 9 2732 2701 141 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> /* Core of can-j1939 that links j1939 to CAN. */ #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/if_arp.h> #include <linux/module.h> #include "j1939-priv.h" MODULE_DESCRIPTION("PF_CAN SAE J1939"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)"); MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); /* LOWLEVEL CAN interface */ /* CAN_HDR: #bytes before can_frame data part */ #define J1939_CAN_HDR (offsetof(struct can_frame, data)) /* lowest layer */ static void j1939_can_recv(struct sk_buff *iskb, void *data) { struct j1939_priv *priv = data; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb, *iskcb; struct can_frame *cf; /* make sure we only get Classical CAN frames */ if (!can_is_can_skb(iskb)) return; /* create a copy of the skb * j1939 only delivers the real data bytes, * the header goes into sockaddr. * j1939 may not touch the incoming skb in such way */ skb = skb_clone(iskb, GFP_ATOMIC); if (!skb) return; j1939_priv_get(priv); can_skb_set_owner(skb, iskb->sk); /* get a pointer to the header of the skb * the skb payload (pointer) is moved, so that the next skb_data * returns the actual payload */ cf = (void *)skb->data; skb_pull(skb, J1939_CAN_HDR); /* fix length, set to dlc, with 8 maximum */ skb_trim(skb, min_t(uint8_t, cf->len, 8)); /* set addr */ skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); iskcb = j1939_skb_to_cb(iskb); skcb->tskey = iskcb->tskey; skcb->priority = (cf->can_id >> 26) & 0x7; skcb->addr.sa = cf->can_id; skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; if (!j1939_address_is_valid(skcb->addr.sa)) { netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", __func__); goto done; } if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; /* normalize pgn: strip dst address */ skcb->addr.pgn &= 0x3ff00; } else { /* set broadcast address */ skcb->addr.da = J1939_NO_ADDR; } /* update localflags */ read_lock_bh(&priv->lock); if (j1939_address_is_unicast(skcb->addr.sa) && priv->ents[skcb->addr.sa].nusers) skcb->flags |= J1939_ECU_LOCAL_SRC; if (j1939_address_is_unicast(skcb->addr.da) && priv->ents[skcb->addr.da].nusers) skcb->flags |= J1939_ECU_LOCAL_DST; read_unlock_bh(&priv->lock); /* deliver into the j1939 stack ... */ j1939_ac_recv(priv, skb); if (j1939_tp_recv(priv, skb)) /* this means the transport layer processed the message */ goto done; j1939_simple_recv(priv, skb); j1939_sk_recv(priv, skb); done: j1939_priv_put(priv); kfree_skb(skb); } /* NETDEV MANAGEMENT */ /* values for can_rx_(un)register */ #define J1939_CAN_ID CAN_EFF_FLAG #define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG) static DEFINE_MUTEX(j1939_netdev_lock); static struct j1939_priv *j1939_priv_create(struct net_device *ndev) { struct j1939_priv *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return NULL; rwlock_init(&priv->lock); INIT_LIST_HEAD(&priv->ecus); priv->ndev = ndev; kref_init(&priv->kref); kref_init(&priv->rx_kref); dev_hold(ndev); netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv); return priv; } static inline void j1939_priv_set(struct net_device *ndev, struct j1939_priv *priv) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); can_ml->j1939_priv = priv; } static void __j1939_priv_release(struct kref *kref) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref); struct net_device *ndev = priv->ndev; netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); WARN_ON_ONCE(!list_empty(&priv->active_session_list)); WARN_ON_ONCE(!list_empty(&priv->ecus)); WARN_ON_ONCE(!list_empty(&priv->j1939_socks)); dev_put(ndev); kfree(priv); } void j1939_priv_put(struct j1939_priv *priv) { kref_put(&priv->kref, __j1939_priv_release); } void j1939_priv_get(struct j1939_priv *priv) { kref_get(&priv->kref); } static int j1939_can_rx_register(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; int ret; j1939_priv_get(priv); ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv, "j1939", NULL); if (ret < 0) { j1939_priv_put(priv); return ret; } return 0; } static void j1939_can_rx_unregister(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv); /* The last reference of priv is dropped by the RCU deferred * j1939_sk_sock_destruct() of the last socket, so we can * safely drop this reference here. */ j1939_priv_put(priv); } static void __j1939_rx_release(struct kref *kref) __releases(&j1939_netdev_lock) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, rx_kref); j1939_can_rx_unregister(priv); j1939_ecu_unmap_all(priv); j1939_priv_set(priv->ndev, NULL); mutex_unlock(&j1939_netdev_lock); } /* get pointer to priv without increasing ref counter */ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); return can_ml->j1939_priv; } static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) { struct j1939_priv *priv; lockdep_assert_held(&j1939_netdev_lock); priv = j1939_ndev_to_priv(ndev); if (priv) j1939_priv_get(priv); return priv; } static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev) { struct j1939_priv *priv; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); mutex_unlock(&j1939_netdev_lock); return priv; } struct j1939_priv *j1939_netdev_start(struct net_device *ndev) { struct j1939_priv *priv, *priv_new; int ret; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); mutex_unlock(&j1939_netdev_lock); return priv; } mutex_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) return ERR_PTR(-ENOMEM); j1939_tp_init(priv); rwlock_init(&priv->j1939_socks_lock); INIT_LIST_HEAD(&priv->j1939_socks); mutex_lock(&j1939_netdev_lock); priv_new = j1939_priv_get_by_ndev_locked(ndev); if (priv_new) { /* Someone was faster than us, use their priv and roll * back our's. */ kref_get(&priv_new->rx_kref); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return priv_new; } j1939_priv_set(ndev, priv); ret = j1939_can_rx_register(priv); if (ret < 0) goto out_priv_put; mutex_unlock(&j1939_netdev_lock); return priv; out_priv_put: j1939_priv_set(ndev, NULL); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return ERR_PTR(ret); } void j1939_netdev_stop(struct j1939_priv *priv) { kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); j1939_priv_put(priv); } int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) { int ret, dlc; canid_t canid; struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct can_frame *cf; /* apply sanity checks */ if (j1939_pgn_is_pdu1(skcb->addr.pgn)) skcb->addr.pgn &= J1939_PGN_PDU1_MAX; else skcb->addr.pgn &= J1939_PGN_MAX; if (skcb->priority > 7) skcb->priority = 6; ret = j1939_ac_fixup(priv, skb); if (unlikely(ret)) goto failed; dlc = skb->len; /* re-claim the CAN_HDR from the SKB */ cf = skb_push(skb, J1939_CAN_HDR); /* initialize header structure */ memset(cf, 0, J1939_CAN_HDR); /* make it a full can frame again */ skb_put_zero(skb, 8 - dlc); canid = CAN_EFF_FLAG | (skcb->priority << 26) | (skcb->addr.pgn << 8) | skcb->addr.sa; if (j1939_pgn_is_pdu1(skcb->addr.pgn)) canid |= skcb->addr.da << 8; cf->can_id = canid; cf->len = dlc; return can_send(skb, 1); failed: kfree_skb(skb); return ret; } static int j1939_netdev_notify(struct notifier_block *nb, unsigned long msg, void *data) { struct net_device *ndev = netdev_notifier_info_to_dev(data); struct can_ml_priv *can_ml = can_get_ml_priv(ndev); struct j1939_priv *priv; if (!can_ml) goto notify_done; priv = j1939_priv_get_by_ndev(ndev); if (!priv) goto notify_done; switch (msg) { case NETDEV_DOWN: j1939_cancel_active_session(priv, NULL); j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; case NETDEV_UNREGISTER: j1939_cancel_active_session(priv, NULL); j1939_sk_netdev_event_netdown(priv); j1939_sk_netdev_event_unregister(priv); break; } j1939_priv_put(priv); notify_done: return NOTIFY_DONE; } static struct notifier_block j1939_netdev_notifier = { .notifier_call = j1939_netdev_notify, }; /* MODULE interface */ static __init int j1939_module_init(void) { int ret; pr_info("can: SAE J1939\n"); ret = register_netdevice_notifier(&j1939_netdev_notifier); if (ret) goto fail_notifier; ret = can_proto_register(&j1939_can_proto); if (ret < 0) { pr_err("can: registration of j1939 protocol failed\n"); goto fail_sk; } return 0; fail_sk: unregister_netdevice_notifier(&j1939_netdev_notifier); fail_notifier: return ret; } static __exit void j1939_module_exit(void) { can_proto_unregister(&j1939_can_proto); unregister_netdevice_notifier(&j1939_netdev_notifier); } module_init(j1939_module_init); module_exit(j1939_module_exit);
10 10 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __MAC802154_DRIVER_OPS #define __MAC802154_DRIVER_OPS #include <linux/types.h> #include <linux/rtnetlink.h> #include <net/mac802154.h> #include "ieee802154_i.h" #include "trace.h" static inline int drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb) { return local->ops->xmit_async(&local->hw, skb); } static inline int drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) { might_sleep(); return local->ops->xmit_sync(&local->hw, skb); } static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.pan_id = pan_id; trace_802154_drv_set_pan_id(local, pan_id); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANID_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.ieee_addr = extended_addr; trace_802154_drv_set_extended_addr(local, extended_addr); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_IEEEADDR_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.short_addr = short_addr; trace_802154_drv_set_short_addr(local, short_addr); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_SADDR_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.pan_coord = is_coord; trace_802154_drv_set_pan_coord(local, is_coord); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANC_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) { int ret; might_sleep(); if (!local->ops->set_promiscuous_mode) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_promiscuous_mode(local, on); ret = local->ops->set_promiscuous_mode(&local->hw, on); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_start(struct ieee802154_local *local, enum ieee802154_filtering_level level, const struct ieee802154_hw_addr_filt *addr_filt) { int ret; might_sleep(); /* setup receive mode parameters e.g. address mode */ if (local->hw.flags & IEEE802154_HW_AFILT) { ret = drv_set_pan_id(local, addr_filt->pan_id); if (ret < 0) return ret; ret = drv_set_short_addr(local, addr_filt->short_addr); if (ret < 0) return ret; ret = drv_set_extended_addr(local, addr_filt->ieee_addr); if (ret < 0) return ret; } switch (level) { case IEEE802154_FILTERING_NONE: fallthrough; case IEEE802154_FILTERING_1_FCS: fallthrough; case IEEE802154_FILTERING_2_PROMISCUOUS: /* TODO: Requires a different receive mode setup e.g. * at86rf233 hardware. */ fallthrough; case IEEE802154_FILTERING_3_SCAN: if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { ret = drv_set_promiscuous_mode(local, true); if (ret < 0) return ret; } else { return -EOPNOTSUPP; } /* In practice other filtering levels can be requested, but as * for now most hardware/drivers only support * IEEE802154_FILTERING_NONE, we fallback to this actual * filtering level in hardware and make our own additional * filtering in mac802154 receive path. * * TODO: Move this logic to the device drivers as hardware may * support more higher level filters. Hardware may also require * a different order how register are set, which could currently * be buggy, so all received parameters need to be moved to the * start() callback and let the driver go into the mode before * it will turn on receive handling. */ local->phy->filtering = IEEE802154_FILTERING_NONE; break; case IEEE802154_FILTERING_4_FRAME_FIELDS: /* Do not error out if IEEE802154_HW_PROMISCUOUS because we * expect the hardware to operate at the level * IEEE802154_FILTERING_4_FRAME_FIELDS anyway. */ if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { ret = drv_set_promiscuous_mode(local, false); if (ret < 0) return ret; } local->phy->filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; break; default: WARN_ON(1); return -EINVAL; } trace_802154_drv_start(local); local->started = true; smp_mb(); ret = local->ops->start(&local->hw); trace_802154_drv_return_int(local, ret); return ret; } static inline void drv_stop(struct ieee802154_local *local) { might_sleep(); trace_802154_drv_stop(local); local->ops->stop(&local->hw); trace_802154_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); tasklet_enable(&local->tasklet); barrier(); local->started = false; } static inline int drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) { int ret; might_sleep(); trace_802154_drv_set_channel(local, page, channel); ret = local->ops->set_channel(&local->hw, page, channel); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { int ret; might_sleep(); if (!local->ops->set_txpower) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_tx_power(local, mbm); ret = local->ops->set_txpower(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_cca_mode(struct ieee802154_local *local, const struct wpan_phy_cca *cca) { int ret; might_sleep(); if (!local->ops->set_cca_mode) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_cca_mode(local, cca); ret = local->ops->set_cca_mode(&local->hw, cca); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) { int ret; might_sleep(); if (!local->ops->set_lbt) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_lbt_mode(local, mode); ret = local->ops->set_lbt(&local->hw, mode); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { int ret; might_sleep(); if (!local->ops->set_cca_ed_level) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_cca_ed_level(local, mbm); ret = local->ops->set_cca_ed_level(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, u8 max_csma_backoffs) { int ret; might_sleep(); if (!local->ops->set_csma_params) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_csma_params(local, min_be, max_be, max_csma_backoffs); ret = local->ops->set_csma_params(&local->hw, min_be, max_be, max_csma_backoffs); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) { int ret; might_sleep(); if (!local->ops->set_frame_retries) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_max_frame_retries(local, max_frame_retries); ret = local->ops->set_frame_retries(&local->hw, max_frame_retries); trace_802154_drv_return_int(local, ret); return ret; } #endif /* __MAC802154_DRIVER_OPS */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor network mediation definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #ifndef __AA_NET_H #define __AA_NET_H #include <net/sock.h> #include <linux/path.h> #include "apparmorfs.h" #include "label.h" #include "perms.h" #include "policy.h" #define AA_MAY_SEND AA_MAY_WRITE #define AA_MAY_RECEIVE AA_MAY_READ #define AA_MAY_SHUTDOWN AA_MAY_DELETE #define AA_MAY_CONNECT AA_MAY_OPEN #define AA_MAY_ACCEPT 0x00100000 #define AA_MAY_BIND 0x00200000 #define AA_MAY_LISTEN 0x00400000 #define AA_MAY_SETOPT 0x01000000 #define AA_MAY_GETOPT 0x02000000 #define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN | \ AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \ AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT) #define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\ AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD | \ AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK | \ AA_MAY_MPROT) #define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT | \ AA_MAY_ACCEPT) struct aa_sk_ctx { struct aa_label __rcu *label; struct aa_label __rcu *peer; struct aa_label __rcu *peer_lastupdate; /* ptr cmp only, no deref */ }; static inline struct aa_sk_ctx *aa_sock(const struct sock *sk) { return sk->sk_security + apparmor_blob_sizes.lbs_sock; } #define DEFINE_AUDIT_NET(NAME, OP, CRED, SK, F, T, P) \ struct lsm_network_audit NAME ## _net = { .sk = (SK), \ .family = (F)}; \ DEFINE_AUDIT_DATA(NAME, \ ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \ LSM_AUDIT_DATA_NONE, \ AA_CLASS_NET, \ OP); \ NAME.common.u.net = &(NAME ## _net); \ NAME.subj_cred = (CRED); \ NAME.net.type = (T); \ NAME.net.protocol = (P) #define DEFINE_AUDIT_SK(NAME, OP, CRED, SK) \ DEFINE_AUDIT_NET(NAME, OP, CRED, SK, (SK)->sk_family, (SK)->sk_type, \ (SK)->sk_protocol) struct aa_secmark { u8 audit; u8 deny; u32 secid; char *label; }; extern struct aa_sfs_entry aa_sfs_entry_network[]; extern struct aa_sfs_entry aa_sfs_entry_networkv9[]; int aa_do_perms(struct aa_profile *profile, struct aa_policydb *policy, aa_state_t state, u32 request, struct aa_perms *p, struct apparmor_audit_data *ad); /* passing in state returned by XXX_mediates_AF() */ aa_state_t aa_match_to_prot(struct aa_policydb *policy, aa_state_t state, u32 request, u16 af, int type, int protocol, struct aa_perms **p, const char **info); void audit_net_cb(struct audit_buffer *ab, void *va); int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, u16 family, int type, int protocol); int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol); static inline int aa_profile_af_sk_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, struct sock *sk) { return aa_profile_af_perm(profile, ad, request, sk->sk_family, sk->sk_type, sk->sk_protocol); } int aa_sk_perm(const char *op, u32 request, struct sock *sk); int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct file *file); int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct sock *sk); #endif /* __AA_NET_H */
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa_stubs.h - Stubs for the Distributed Switch Architecture framework */ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <net/dsa.h> #if IS_ENABLED(CONFIG_NET_DSA) extern const struct dsa_stubs *dsa_stubs; struct dsa_stubs { int (*conduit_hwtstamp_validate)(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); }; static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { if (!netdev_uses_dsa(dev)) return 0; /* rtnl_lock() is a sufficient guarantee, because as long as * netdev_uses_dsa() returns true, the dsa_core module is still * registered, and so, dsa_unregister_stubs() couldn't have run. * For netdev_uses_dsa() to start returning false, it would imply that * dsa_conduit_teardown() has executed, which requires rtnl_lock(). */ ASSERT_RTNL(); return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack); } #else static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { return 0; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0-or-later /* Paravirtualization interfaces Copyright (C) 2006 Rusty Russell IBM Corporation 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc */ #include <linux/errno.h> #include <linux/init.h> #include <linux/export.h> #include <linux/efi.h> #include <linux/bcd.h> #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> #include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> #include <asm/delay.h> #include <asm/fixmap.h> #include <asm/apic.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/special_insns.h> #include <asm/tlb.h> #include <asm/io_bitmap.h> #include <asm/gsseg.h> #include <asm/msr.h> /* stub always returning 0. */ DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); } #ifdef CONFIG_PARAVIRT_XXL DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_enable(&virt_spin_lock_key); } struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { return 0; } DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)) { static_call_update(pv_sched_clock, func); } static noinstr void pv_native_safe_halt(void) { native_safe_halt(); } #ifdef CONFIG_PARAVIRT_XXL static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); } static noinstr unsigned long pv_native_read_cr3(void) { return __native_read_cr3(); } static noinstr void pv_native_write_cr3(unsigned long cr3) { native_write_cr3(cr3); } static noinstr unsigned long pv_native_get_debugreg(int regno) { return native_get_debugreg(regno); } static noinstr void pv_native_set_debugreg(int regno, unsigned long val) { native_set_debugreg(regno, val); } #endif struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL .extra_user_64bit_cs = __USER_CS, #endif }; /* 64-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { /* Cpu ops. */ .cpu.io_delay = native_io_delay, #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, .cpu.get_debugreg = pv_native_get_debugreg, .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, .cpu.write_msr_safe = native_write_msr_safe, .cpu.read_pmc = native_read_pmc, .cpu.load_tr_desc = native_load_tr_desc, .cpu.set_ldt = native_set_ldt, .cpu.load_gdt = native_load_gdt, .cpu.load_idt = native_load_idt, .cpu.store_tr = native_store_tr, .cpu.load_tls = native_load_tls, .cpu.load_gs_index = native_load_gs_index, .cpu.write_ldt_entry = native_write_ldt_entry, .cpu.write_gdt_entry = native_write_gdt_entry, .cpu.write_idt_entry = native_write_idt_entry, .cpu.alloc_ldt = paravirt_nop, .cpu.free_ldt = paravirt_nop, .cpu.load_sp0 = native_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, #endif .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), #endif /* CONFIG_PARAVIRT_XXL */ /* Irq HLT ops. */ .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, /* Mmu ops. */ .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = pv_native_read_cr3, .mmu.write_cr3 = pv_native_write_cr3, .mmu.pgd_alloc = __paravirt_pgd_alloc, .mmu.pgd_free = paravirt_nop, .mmu.alloc_pte = paravirt_nop, .mmu.alloc_pmd = paravirt_nop, .mmu.alloc_pud = paravirt_nop, .mmu.alloc_p4d = paravirt_nop, .mmu.release_pte = paravirt_nop, .mmu.release_pmd = paravirt_nop, .mmu.release_pud = paravirt_nop, .mmu.release_p4d = paravirt_nop, .mmu.set_pte = native_set_pte, .mmu.set_pmd = native_set_pmd, .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, .mmu.set_pud = native_set_pud, .mmu.pmd_val = PTE_IDENT, .mmu.make_pmd = PTE_IDENT, .mmu.pud_val = PTE_IDENT, .mmu.make_pud = PTE_IDENT, .mmu.set_p4d = native_set_p4d, .mmu.p4d_val = PTE_IDENT, .mmu.make_p4d = PTE_IDENT, .mmu.set_pgd = native_set_pgd, .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, .mmu.make_pte = PTE_IDENT, .mmu.make_pgd = PTE_IDENT, .mmu.enter_mmap = paravirt_nop, .mmu.lazy_mode = { .enter = paravirt_nop, .leave = paravirt_nop, .flush = paravirt_nop, }, .mmu.set_fixmap = native_set_fixmap, #endif /* CONFIG_PARAVIRT_XXL */ #if defined(CONFIG_PARAVIRT_SPINLOCKS) /* Lock ops. */ #ifdef CONFIG_SMP .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .lock.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .lock.wait = paravirt_nop, .lock.kick = paravirt_nop, .lock.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ #endif }; #ifdef CONFIG_PARAVIRT_XXL NOKPROBE_SYMBOL(native_load_idt); #endif EXPORT_SYMBOL(pv_ops); EXPORT_SYMBOL_GPL(pv_info);
191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 // SPDX-License-Identifier: BSD-3-Clause /* * linux/net/sunrpc/auth_gss/auth_gss.c * * RPCSEC_GSS client authentication. * * Copyright (c) 2000 The Regents of the University of Michigan. * All rights reserved. * * Dug Song <dugsong@monkey.org> * Andy Adamson <andros@umich.edu> */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/gss_api.h> #include <linux/uaccess.h> #include <linux/hashtable.h> #include "auth_gss_internal.h" #include "../netns.h" #include <trace/events/rpcgss.h> static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; static const struct rpc_credops gss_nullops; #define GSS_RETRY_EXPIRED 5 static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; #define GSS_KEY_EXPIRE_TIMEO 240 static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif /* * This compile-time check verifies that we will not exceed the * slack space allotted by the client and server auth_gss code * before they call gss_wrap(). */ #define GSS_KRB5_MAX_SLACK_NEEDED \ (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */ \ + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ + XDR_UNIT * 2 /* RPC verifier */ \ + GSS_KRB5_TOK_HDR_LEN \ + GSS_KRB5_MAX_CKSUM_LEN) #define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) /* length of a krb5 verifier (48), plus data added before arguments when * using integrity (two 4-byte integers): */ #define GSS_VERF_SLACK 100 static DEFINE_HASHTABLE(gss_auth_hash_table, 4); static DEFINE_SPINLOCK(gss_auth_hash_lock); struct gss_pipe { struct rpc_pipe_dir_object pdo; struct rpc_pipe *pipe; struct rpc_clnt *clnt; const char *name; struct kref kref; }; struct gss_auth { struct kref kref; struct hlist_node hash; struct rpc_auth rpc_auth; struct gss_api_mech *mech; enum rpc_gss_svc service; struct rpc_clnt *client; struct net *net; netns_tracker ns_tracker; /* * There are two upcall pipes; dentry[1], named "gssd", is used * for the new text-based upcall; dentry[0] is named after the * mechanism (for example, "krb5") and exists for * backwards-compatibility with older gssd's. */ struct gss_pipe *gss_pipe[2]; const char *target_name; }; /* pipe_version >= 0 if and only if someone has a pipe open. */ static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); static void gss_put_auth(struct gss_auth *gss_auth); static void gss_free_ctx(struct gss_cl_ctx *); static const struct rpc_pipe_ops gss_upcall_ops_v0; static const struct rpc_pipe_ops gss_upcall_ops_v1; static inline struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx) { refcount_inc(&ctx->count); return ctx; } static inline void gss_put_ctx(struct gss_cl_ctx *ctx) { if (refcount_dec_and_test(&ctx->count)) gss_free_ctx(ctx); } /* gss_cred_set_ctx: * called by gss_upcall_callback and gss_create_upcall in order * to set the gss context. The actual exchange of an old context * and a new one is protected by the pipe->lock. */ static void gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) return; gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } static struct gss_cl_ctx * gss_cred_get_ctx(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (ctx) gss_get_ctx(ctx); rcu_read_unlock(); return ctx; } static struct gss_cl_ctx * gss_alloc_context(void) { struct gss_cl_ctx *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ spin_lock_init(&ctx->gc_seq_lock); refcount_set(&ctx->count,1); } return ctx; } #define GSSD_MIN_TIMEOUT (60 * 60) static const void * gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm) { const void *q; unsigned int seclen; unsigned int timeout; unsigned long now = jiffies; u32 window_size; int ret; /* First unsigned int gives the remaining lifetime in seconds of the * credential - e.g. the remaining TGT lifetime for Kerberos or * the -t value passed to GSSD. */ p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); if (IS_ERR(p)) goto err; if (timeout == 0) timeout = GSSD_MIN_TIMEOUT; ctx->gc_expiry = now + ((unsigned long)timeout * HZ); /* Sequence number window. Determines the maximum number of * simultaneous requests */ p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); if (IS_ERR(p)) goto err; ctx->gc_win = window_size; /* gssd signals an error by passing ctx->gc_win = 0: */ if (ctx->gc_win == 0) { /* * in which case, p points to an error code. Anything other * than -EKEYEXPIRED gets converted to -EACCES. */ p = simple_get_bytes(p, end, &ret, sizeof(ret)); if (!IS_ERR(p)) p = (ret == -EKEYEXPIRED) ? ERR_PTR(-EKEYEXPIRED) : ERR_PTR(-EACCES); goto err; } /* copy the opaque wire context */ p = simple_get_netobj(p, end, &ctx->gc_wire_ctx); if (IS_ERR(p)) goto err; /* import the opaque security context */ p = simple_get_bytes(p, end, &seclen, sizeof(seclen)); if (IS_ERR(p)) goto err; q = (const void *)((const char *)p + seclen); if (unlikely(q > end || q < p)) { p = ERR_PTR(-EFAULT); goto err; } ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_KERNEL); if (ret < 0) { trace_rpcgss_import_ctx(ret); p = ERR_PTR(ret); goto err; } /* is there any trailing data? */ if (q == end) { p = q; goto done; } /* pull in acceptor name (if there is one) */ p = simple_get_netobj(q, end, &ctx->gc_acceptor); if (IS_ERR(p)) goto err; done: trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, ctx->gc_acceptor.data); err: return p; } /* XXX: Need some documentation about why UPCALL_BUF_LEN is so small. * Is user space expecting no more than UPCALL_BUF_LEN bytes? * Note that there are now _two_ NI_MAXHOST sized data items * being passed in this string. */ #define UPCALL_BUF_LEN 256 struct gss_upcall_msg { refcount_t count; kuid_t uid; const char *service_name; struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; struct rpc_pipe *pipe; struct rpc_wait_queue rpc_waitqueue; wait_queue_head_t waitqueue; struct gss_cl_ctx *ctx; char databuf[UPCALL_BUF_LEN]; }; static int get_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; spin_lock(&pipe_version_lock); if (sn->pipe_version >= 0) { atomic_inc(&sn->pipe_users); ret = sn->pipe_version; } else ret = -EAGAIN; spin_unlock(&pipe_version_lock); return ret; } static void put_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) { sn->pipe_version = -1; spin_unlock(&pipe_version_lock); } } static void gss_release_msg(struct gss_upcall_msg *gss_msg) { struct net *net = gss_msg->auth->net; if (!refcount_dec_and_test(&gss_msg->count)) return; put_pipe_version(net); BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); gss_put_auth(gss_msg->auth); kfree_const(gss_msg->service_name); kfree(gss_msg); } static struct gss_upcall_msg * __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; } return NULL; } /* Try to add an upcall to the pipefs queue. * If an upcall owned by our uid already exists, then we return a reference * to that upcall instead of adding the new upcall. */ static inline struct gss_upcall_msg * gss_add_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; struct gss_upcall_msg *old; spin_lock(&pipe->lock); old = __gss_find_upcall(pipe, gss_msg->uid, gss_msg->auth); if (old == NULL) { refcount_inc(&gss_msg->count); list_add(&gss_msg->list, &pipe->in_downcall); } else gss_msg = old; spin_unlock(&pipe->lock); return gss_msg; } static void __gss_unhash_msg(struct gss_upcall_msg *gss_msg) { list_del_init(&gss_msg->list); rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); wake_up_all(&gss_msg->waitqueue); refcount_dec(&gss_msg->count); } static void gss_unhash_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; if (list_empty(&gss_msg->list)) return; spin_lock(&pipe->lock); if (!list_empty(&gss_msg->list)) __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); } static void gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) { switch (gss_msg->msg.errno) { case 0: if (gss_msg->ctx == NULL) break; clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); break; case -EKEYEXPIRED: set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); } gss_cred->gc_upcall_timestamp = jiffies; gss_cred->gc_upcall = NULL; rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); } static void gss_upcall_callback(struct rpc_task *task) { struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; struct rpc_pipe *pipe = gss_msg->pipe; spin_lock(&pipe->lock); gss_handle_downcall_result(gss_cred, gss_msg); spin_unlock(&pipe->lock); task->tk_status = gss_msg->msg.errno; gss_release_msg(gss_msg); } static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; uid_t uid = from_kuid_munged(userns, gss_msg->uid); memcpy(gss_msg->databuf, &uid, sizeof(uid)); gss_msg->msg.data = gss_msg->databuf; gss_msg->msg.len = sizeof(uid); BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf)); } static ssize_t gss_v0_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->copied == 0) gss_encode_v0_msg(gss_msg, file->f_cred); return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, const char *target_name, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; size_t buflen = sizeof(gss_msg->databuf); int len; len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name, from_kuid_munged(userns, gss_msg->uid)); buflen -= len; p += len; gss_msg->msg.len = len; /* * target= is a full service principal that names the remote * identity that we are authenticating to. */ if (target_name) { len = scnprintf(p, buflen, " target=%s", target_name); buflen -= len; p += len; gss_msg->msg.len += len; } /* * gssd uses service= and srchost= to select a matching key from * the system's keytab to use as the source principal. * * service= is the service name part of the source principal, * or "*" (meaning choose any). * * srchost= is the hostname part of the source principal. When * not provided, gssd uses the local hostname. */ if (service_name) { char *c = strchr(service_name, '@'); if (!c) len = scnprintf(p, buflen, " service=%s", service_name); else len = scnprintf(p, buflen, " service=%.*s srchost=%s", (int)(c - service_name), service_name, c + 1); buflen -= len; p += len; gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { len = scnprintf(p, buflen, " enctypes=%s", mech->gm_upcall_enctypes); buflen -= len; p += len; gss_msg->msg.len += len; } trace_rpcgss_upcall_msg(gss_msg->databuf); len = scnprintf(p, buflen, "\n"); if (len == 0) goto out_overflow; gss_msg->msg.len += len; gss_msg->msg.data = gss_msg->databuf; return 0; out_overflow: WARN_ON_ONCE(1); return -ENOMEM; } static ssize_t gss_v1_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); int err; if (msg->copied == 0) { err = gss_encode_v1_msg(gss_msg, gss_msg->service_name, gss_msg->auth->target_name, file->f_cred); if (err) return err; } return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static struct gss_upcall_msg * gss_alloc_msg(struct gss_auth *gss_auth, kuid_t uid, const char *service_name) { struct gss_upcall_msg *gss_msg; int vers; int err = -ENOMEM; gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); if (gss_msg == NULL) goto err; vers = get_pipe_version(gss_auth->net); err = vers; if (err < 0) goto err_free_msg; gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe; INIT_LIST_HEAD(&gss_msg->list); rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); init_waitqueue_head(&gss_msg->waitqueue); refcount_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; kref_get(&gss_auth->kref); if (service_name) { gss_msg->service_name = kstrdup_const(service_name, GFP_KERNEL); if (!gss_msg->service_name) { err = -ENOMEM; goto err_put_pipe_version; } } return gss_msg; err_put_pipe_version: put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); err: return ERR_PTR(err); } static struct gss_upcall_msg * gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; kuid_t uid = cred->cr_cred->fsuid; gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) return gss_new; gss_msg = gss_add_msg(gss_new); if (gss_msg == gss_new) { int res; refcount_inc(&gss_msg->count); res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg); if (res) { gss_unhash_msg(gss_new); refcount_dec(&gss_msg->count); gss_release_msg(gss_new); gss_msg = ERR_PTR(res); } } else gss_release_msg(gss_new); return gss_msg; } static void warn_gssd(void) { dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n"); } static inline int gss_refresh_upcall(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe; int err = 0; gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we * shouldn't normally hit this case on a refresh. */ warn_gssd(); rpc_sleep_on_timeout(&pipe_version_rpc_waitqueue, task, NULL, jiffies + (15 * HZ)); err = -EAGAIN; goto out; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; spin_lock(&pipe->lock); if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ refcount_inc(&gss_msg->count); rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); } else { gss_handle_downcall_result(gss_cred, gss_msg); err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static inline int gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct net *net = gss_auth->net; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; DEFINE_WAIT(wait); int err; retry: err = 0; /* if gssd is down, just skip upcalling altogether */ if (!gssd_running(net)) { warn_gssd(); err = -EACCES; goto out; } gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, sn->pipe_version >= 0, 15 * HZ); if (sn->pipe_version < 0) { warn_gssd(); err = -EACCES; } if (err < 0) goto out; goto retry; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; for (;;) { prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE); spin_lock(&pipe->lock); if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { break; } spin_unlock(&pipe->lock); if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto out_intr; } schedule(); } if (gss_msg->ctx) { trace_rpcgss_ctx_init(gss_cred); gss_cred_set_ctx(cred, gss_msg->ctx); } else { err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); out_intr: finish_wait(&gss_msg->waitqueue, &wait); gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static struct gss_upcall_msg * gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (!rpc_msg_is_inflight(&pos->msg)) continue; refcount_inc(&pos->count); return pos; } return NULL; } #define MSG_BUF_MAXSIZE 1024 static ssize_t gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { const void *p, *end; void *buf; struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe = RPC_I(file_inode(filp))->pipe; struct gss_cl_ctx *ctx; uid_t id; kuid_t uid; ssize_t err = -EFBIG; if (mlen > MSG_BUF_MAXSIZE) goto out; err = -ENOMEM; buf = kmalloc(mlen, GFP_KERNEL); if (!buf) goto out; err = -EFAULT; if (copy_from_user(buf, src, mlen)) goto err; end = (const void *)((char *)buf + mlen); p = simple_get_bytes(buf, end, &id, sizeof(id)); if (IS_ERR(p)) { err = PTR_ERR(p); goto err; } uid = make_kuid(current_user_ns(), id); if (!uid_valid(uid)) { err = -EINVAL; goto err; } err = -ENOMEM; ctx = gss_alloc_context(); if (ctx == NULL) goto err; err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; } list_del_init(&gss_msg->list); spin_unlock(&pipe->lock); p = gss_fill_context(p, end, ctx, gss_msg->auth->mech); if (IS_ERR(p)) { err = PTR_ERR(p); switch (err) { case -EACCES: case -EKEYEXPIRED: gss_msg->msg.errno = err; err = mlen; break; case -EFAULT: case -ENOMEM: case -EINVAL: case -ENOSYS: gss_msg->msg.errno = -EAGAIN; break; default: printk(KERN_CRIT "%s: bad return from " "gss_fill_context: %zd\n", __func__, err); gss_msg->msg.errno = -EIO; } goto err_release_msg; } gss_msg->ctx = gss_get_ctx(ctx); err = mlen; err_release_msg: spin_lock(&pipe->lock); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); err_put_ctx: gss_put_ctx(ctx); err: kfree(buf); out: return err; } static int gss_pipe_open(struct inode *inode, int new_version) { struct net *net = inode->i_sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret = 0; spin_lock(&pipe_version_lock); if (sn->pipe_version < 0) { /* First open of any gss pipe determines the version: */ sn->pipe_version = new_version; rpc_wake_up(&pipe_version_rpc_waitqueue); wake_up(&pipe_version_waitqueue); } else if (sn->pipe_version != new_version) { /* Trying to open a pipe of a different version */ ret = -EBUSY; goto out; } atomic_inc(&sn->pipe_users); out: spin_unlock(&pipe_version_lock); return ret; } static int gss_pipe_open_v0(struct inode *inode) { return gss_pipe_open(inode, 0); } static int gss_pipe_open_v1(struct inode *inode) { return gss_pipe_open(inode, 1); } static void gss_pipe_release(struct inode *inode) { struct net *net = inode->i_sb->s_fs_info; struct rpc_pipe *pipe = RPC_I(inode)->pipe; struct gss_upcall_msg *gss_msg; restart: spin_lock(&pipe->lock); list_for_each_entry(gss_msg, &pipe->in_downcall, list) { if (!list_empty(&gss_msg->msg.list)) continue; gss_msg->msg.errno = -EPIPE; refcount_inc(&gss_msg->count); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); goto restart; } spin_unlock(&pipe->lock); put_pipe_version(net); } static void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->errno < 0) { refcount_inc(&gss_msg->count); gss_unhash_msg(gss_msg); if (msg->errno == -ETIMEDOUT) warn_gssd(); gss_release_msg(gss_msg); } gss_release_msg(gss_msg); } static void gss_pipe_dentry_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *gss_pipe = pdo->pdo_data; rpc_unlink(gss_pipe->pipe); } static int gss_pipe_dentry_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *p = pdo->pdo_data; return rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); } static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = { .create = gss_pipe_dentry_create, .destroy = gss_pipe_dentry_destroy, }; static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct gss_pipe *p; int err = -ENOMEM; p = kmalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) goto err; p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(p->pipe)) { err = PTR_ERR(p->pipe); goto err_free_gss_pipe; } p->name = name; p->clnt = clnt; kref_init(&p->kref); rpc_init_pipe_dir_object(&p->pdo, &gss_pipe_dir_object_ops, p); return p; err_free_gss_pipe: kfree(p); err: return ERR_PTR(err); } struct gss_alloc_pdo { struct rpc_clnt *clnt; const char *name; const struct rpc_pipe_ops *upcall_ops; }; static int gss_pipe_match_pdo(struct rpc_pipe_dir_object *pdo, void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; if (pdo->pdo_ops != &gss_pipe_dir_object_ops) return 0; gss_pipe = container_of(pdo, struct gss_pipe, pdo); if (strcmp(gss_pipe->name, args->name) != 0) return 0; if (!kref_get_unless_zero(&gss_pipe->kref)) return 0; return 1; } static struct rpc_pipe_dir_object *gss_pipe_alloc_pdo(void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; gss_pipe = gss_pipe_alloc(args->clnt, args->name, args->upcall_ops); if (!IS_ERR(gss_pipe)) return &gss_pipe->pdo; return NULL; } static struct gss_pipe *gss_pipe_get(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct net *net = rpc_net_ns(clnt); struct rpc_pipe_dir_object *pdo; struct gss_alloc_pdo args = { .clnt = clnt, .name = name, .upcall_ops = upcall_ops, }; pdo = rpc_find_or_alloc_pipe_dir_object(net, &clnt->cl_pipedir_objects, gss_pipe_match_pdo, gss_pipe_alloc_pdo, &args); if (pdo != NULL) return container_of(pdo, struct gss_pipe, pdo); return ERR_PTR(-ENOMEM); } static void __gss_pipe_free(struct gss_pipe *p) { struct rpc_clnt *clnt = p->clnt; struct net *net = rpc_net_ns(clnt); rpc_remove_pipe_dir_object(net, &clnt->cl_pipedir_objects, &p->pdo); rpc_destroy_pipe_data(p->pipe); kfree(p); } static void __gss_pipe_release(struct kref *kref) { struct gss_pipe *p = container_of(kref, struct gss_pipe, kref); __gss_pipe_free(p); } static void gss_pipe_free(struct gss_pipe *p) { if (p != NULL) kref_put(&p->kref, __gss_pipe_release); } /* * NOTE: we have the opportunity to use different * parameters based on the input flavor (which must be a pseudoflavor) */ static struct gss_auth * gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { rpc_authflavor_t flavor = args->pseudoflavor; struct gss_auth *gss_auth; struct gss_pipe *gss_pipe; struct rpc_auth * auth; int err = -ENOMEM; /* XXX? */ if (!try_module_get(THIS_MODULE)) return ERR_PTR(err); if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) goto out_dec; INIT_HLIST_NODE(&gss_auth->hash); gss_auth->target_name = NULL; if (args->target_name) { gss_auth->target_name = kstrdup(args->target_name, GFP_KERNEL); if (gss_auth->target_name == NULL) goto err_free; } gss_auth->client = clnt; gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker, GFP_KERNEL); err = -EINVAL; gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); if (!gss_auth->mech) goto err_put_net; gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); if (gss_auth->service == 0) goto err_put_mech; if (!gssd_running(gss_auth->net)) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags); auth->au_ops = &authgss_ops; auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags); refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); if (err) goto err_put_mech; /* * Note: if we created the old pipe first, then someone who * examined the directory at the right moment might conclude * that we supported only the old pipe. So we instead create * the new pipe first. */ gss_pipe = gss_pipe_get(clnt, "gssd", &gss_upcall_ops_v1); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_credcache; } gss_auth->gss_pipe[1] = gss_pipe; gss_pipe = gss_pipe_get(clnt, gss_auth->mech->gm_name, &gss_upcall_ops_v0); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_pipe_1; } gss_auth->gss_pipe[0] = gss_pipe; return gss_auth; err_destroy_pipe_1: gss_pipe_free(gss_auth->gss_pipe[1]); err_destroy_credcache: rpcauth_destroy_credcache(auth); err_put_mech: gss_mech_put(gss_auth->mech); err_put_net: put_net_track(gss_auth->net, &gss_auth->ns_tracker); err_free: kfree(gss_auth->target_name); kfree(gss_auth); out_dec: module_put(THIS_MODULE); trace_rpcgss_createauth(flavor, err); return ERR_PTR(err); } static void gss_free(struct gss_auth *gss_auth) { gss_pipe_free(gss_auth->gss_pipe[0]); gss_pipe_free(gss_auth->gss_pipe[1]); gss_mech_put(gss_auth->mech); put_net_track(gss_auth->net, &gss_auth->ns_tracker); kfree(gss_auth->target_name); kfree(gss_auth); module_put(THIS_MODULE); } static void gss_free_callback(struct kref *kref) { struct gss_auth *gss_auth = container_of(kref, struct gss_auth, kref); gss_free(gss_auth); } static void gss_put_auth(struct gss_auth *gss_auth) { kref_put(&gss_auth->kref, gss_free_callback); } static void gss_destroy(struct rpc_auth *auth) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); if (hash_hashed(&gss_auth->hash)) { spin_lock(&gss_auth_hash_lock); hash_del(&gss_auth->hash); spin_unlock(&gss_auth_hash_lock); } gss_pipe_free(gss_auth->gss_pipe[0]); gss_auth->gss_pipe[0] = NULL; gss_pipe_free(gss_auth->gss_pipe[1]); gss_auth->gss_pipe[1] = NULL; rpcauth_destroy_credcache(auth); gss_put_auth(gss_auth); } /* * Auths may be shared between rpc clients that were cloned from a * common client with the same xprt, if they also share the flavor and * target_name. * * The auth is looked up from the oldest parent sharing the same * cl_xprt, and the auth itself references only that common parent * (which is guaranteed to last as long as any of its descendants). */ static struct gss_auth * gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt, struct gss_auth *new) { struct gss_auth *gss_auth; unsigned long hashval = (unsigned long)clnt; spin_lock(&gss_auth_hash_lock); hash_for_each_possible(gss_auth_hash_table, gss_auth, hash, hashval) { if (gss_auth->client != clnt) continue; if (gss_auth->rpc_auth.au_flavor != args->pseudoflavor) continue; if (gss_auth->target_name != args->target_name) { if (gss_auth->target_name == NULL) continue; if (args->target_name == NULL) continue; if (strcmp(gss_auth->target_name, args->target_name)) continue; } if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } if (new) hash_add(gss_auth_hash_table, &new->hash, hashval); gss_auth = new; out: spin_unlock(&gss_auth_hash_lock); return gss_auth; } static struct gss_auth * gss_create_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct gss_auth *new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, NULL); if (gss_auth != NULL) goto out; new = gss_create_new(args, clnt); if (IS_ERR(new)) return new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, new); if (gss_auth != new) gss_destroy(&new->rpc_auth); out: return gss_auth; } static struct rpc_auth * gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); while (clnt != clnt->cl_parent) { struct rpc_clnt *parent = clnt->cl_parent; /* Find the original parent for this transport */ if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps) break; clnt = parent; } gss_auth = gss_create_hashed(args, clnt); if (IS_ERR(gss_auth)) return ERR_CAST(gss_auth); return &gss_auth->rpc_auth; } static struct gss_cred * gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct gss_cred *new; /* Make a copy of the cred so that we can reference count it */ new = kzalloc(sizeof(*gss_cred), GFP_KERNEL); if (new) { struct auth_cred acred = { .cred = gss_cred->gc_base.cr_cred, }; struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); rpcauth_init_cred(&new->gc_base, &acred, &gss_auth->rpc_auth, &gss_nullops); new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; new->gc_service = gss_cred->gc_service; new->gc_principal = gss_cred->gc_principal; kref_get(&gss_auth->kref); rcu_assign_pointer(new->gc_ctx, ctx); gss_get_ctx(ctx); } return new; } /* * gss_send_destroy_context will cause the RPCSEC_GSS to send a NULL RPC call * to the server with the GSS control procedure field set to * RPC_GSS_PROC_DESTROY. This should normally cause the server to release * all RPCSEC_GSS state associated with that context. */ static void gss_send_destroy_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); struct gss_cred *new; struct rpc_task *task; new = gss_dup_cred(gss_auth, gss_cred); if (new) { ctx->gc_proc = RPC_GSS_PROC_DESTROY; trace_rpcgss_ctx_destroy(gss_cred); task = rpc_call_null(gss_auth->client, &new->gc_base, RPC_TASK_ASYNC); if (!IS_ERR(task)) rpc_put_task(task); put_rpccred(&new->gc_base); } } /* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure * to create a new cred or context, so they check that things have been * allocated before freeing them. */ static void gss_do_free_ctx(struct gss_cl_ctx *ctx) { gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); kfree(ctx->gc_acceptor.data); kfree(ctx); } static void gss_free_ctx_callback(struct rcu_head *head) { struct gss_cl_ctx *ctx = container_of(head, struct gss_cl_ctx, gc_rcu); gss_do_free_ctx(ctx); } static void gss_free_ctx(struct gss_cl_ctx *ctx) { call_rcu(&ctx->gc_rcu, gss_free_ctx_callback); } static void gss_free_cred(struct gss_cred *gss_cred) { kfree(gss_cred); } static void gss_free_cred_callback(struct rcu_head *head) { struct gss_cred *gss_cred = container_of(head, struct gss_cred, gc_base.cr_rcu); gss_free_cred(gss_cred); } static void gss_destroy_nullcred(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); put_cred(cred->cr_cred); call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); gss_put_auth(gss_auth); } static void gss_destroy_cred(struct rpc_cred *cred) { if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) gss_send_destroy_context(cred); gss_destroy_nullcred(cred); } static int gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) { return hash_64(from_kuid(&init_user_ns, acred->cred->fsuid), hashbits); } /* * Lookup RPCSEC_GSS cred for the current process */ static struct rpc_cred *gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { return rpcauth_lookup_credcache(auth, acred, flags, rpc_task_gfp_mask()); } static struct rpc_cred * gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *cred = NULL; int err = -ENOMEM; if (!(cred = kzalloc(sizeof(*cred), gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); /* * Note: in order to force a call to call_refresh(), we deliberately * fail to flag the credential as RPCAUTH_CRED_UPTODATE. */ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; cred->gc_principal = acred->principal; kref_get(&gss_auth->kref); return &cred->gc_base; out_err: return ERR_PTR(err); } static int gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred,struct gss_cred, gc_base); int err; do { err = gss_create_upcall(gss_auth, gss_cred); } while (err == -EAGAIN); return err; } static char * gss_stringify_acceptor(struct rpc_cred *cred) { char *string = NULL; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned int len; struct xdr_netobj *acceptor; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx) goto out; len = ctx->gc_acceptor.len; rcu_read_unlock(); /* no point if there's no string */ if (!len) return NULL; realloc: string = kmalloc(len + 1, GFP_KERNEL); if (!string) return NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); /* did the ctx disappear or was it replaced by one with no acceptor? */ if (!ctx || !ctx->gc_acceptor.len) { kfree(string); string = NULL; goto out; } acceptor = &ctx->gc_acceptor; /* * Did we find a new acceptor that's longer than the original? Allocate * a longer buffer and try again. */ if (len < acceptor->len) { len = acceptor->len; rcu_read_unlock(); kfree(string); goto realloc; } memcpy(string, acceptor->data, acceptor->len); string[acceptor->len] = '\0'; out: rcu_read_unlock(); return string; } /* * Returns -EACCES if GSS context is NULL or will expire within the * timeout (miliseconds) */ static int gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ); int ret = 0; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(timeout, ctx->gc_expiry)) ret = -EACCES; rcu_read_unlock(); return ret; } static int gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; int ret; if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) goto out; /* Don't match with creds that have expired. */ rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(jiffies, ctx->gc_expiry)) { rcu_read_unlock(); return 0; } rcu_read_unlock(); if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: if (acred->principal != NULL) { if (gss_cred->gc_principal == NULL) return 0; ret = strcmp(acred->principal, gss_cred->gc_principal) == 0; } else { if (gss_cred->gc_principal != NULL) return 0; ret = uid_eq(rc->cr_cred->fsuid, acred->cred->fsuid); } return ret; } /* * Marshal credentials. * * The expensive part is computing the verifier. We can't cache a * pre-computed version of the verifier because the seqno, which * is different every time, is included in the MIC. */ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *cred = req->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *cred_len; u32 maj_stat = 0; struct xdr_netobj mic; struct kvec iov; struct xdr_buf verf_buf; int status; u32 seqno; /* Credential */ p = xdr_reserve_space(xdr, 7 * sizeof(*p) + ctx->gc_wire_ctx.len); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; cred_len = p++; spin_lock(&ctx->gc_seq_lock); seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; xprt_rqst_add_seqno(req, seqno); spin_unlock(&ctx->gc_seq_lock); if (*req->rq_seqnos == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); *p++ = cpu_to_be32(*req->rq_seqnos); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); /* Verifier */ /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ iov.iov_base = req->rq_snd_buf.head[0].iov_base; iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); p = xdr_reserve_space(xdr, sizeof(*p)); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) goto expired; else if (maj_stat != 0) goto bad_mic; if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto marshal_failed; status = 0; out: gss_put_ctx(ctx); return status; expired: clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); status = -EKEYEXPIRED; goto out; marshal_failed: status = -EMSGSIZE; goto out; bad_mic: trace_rpcgss_get_mic(task, maj_stat); status = -EIO; goto out; } static int gss_renew_cred(struct rpc_task *task) { struct rpc_cred *oldcred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(oldcred, struct gss_cred, gc_base); struct rpc_auth *auth = oldcred->cr_auth; struct auth_cred acred = { .cred = oldcred->cr_cred, .principal = gss_cred->gc_principal, }; struct rpc_cred *new; new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; } static int gss_cred_is_negative_entry(struct rpc_cred *cred) { if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { unsigned long now = jiffies; unsigned long begin, expire; struct gss_cred *gss_cred; gss_cred = container_of(cred, struct gss_cred, gc_base); begin = gss_cred->gc_upcall_timestamp; expire = begin + gss_expired_cred_retry_delay * HZ; if (time_in_range_open(now, begin, expire)) return 1; } return 0; } /* * Refresh credentials. XXX - finish */ static int gss_refresh(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; int ret = 0; if (gss_cred_is_negative_entry(cred)) return -EKEYEXPIRED; if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { ret = gss_renew_cred(task); if (ret < 0) goto out; cred = task->tk_rqstp->rq_cred; } if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) ret = gss_refresh_upcall(task); out: return ret; } /* Dummy refresh routine: used only when destroying the context */ static int gss_refresh_null(struct rpc_task *task) { return 0; } static u32 gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len) { struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; *seq = cpu_to_be32(seqno); iov.iov_base = seq; iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_buf); mic.data = (u8 *)p; mic.len = len; return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); } static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; u32 len, maj_stat; int status; int i = 1; /* don't recheck the first item */ p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) goto validate_failed; if (*p++ != rpc_auth_gss) goto validate_failed; len = be32_to_cpup(p); if (len > RPC_MAX_AUTH_SIZE) goto validate_failed; p = xdr_inline_decode(xdr, len); if (!p) goto validate_failed; seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len); /* RFC 2203 5.3.3.1 - compute the checksum of each sequence number in the cache */ while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count)) maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i++], seq, p, len); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) goto bad_mic; /* We leave it to unwrap to calculate au_rslack. For now we just * calculate the length of the verifier: */ if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags)) cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; status = 0; out: gss_put_ctx(ctx); kfree(seq); return status; validate_failed: status = -EIO; goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); status = -EACCES; goto out; } static noinline_for_stack int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf; struct xdr_netobj mic; __be32 *p, *integ_len; u32 offset, maj_stat; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; integ_len = p++; *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; if (xdr_buf_subsegment(snd_buf, &integ_buf, offset, snd_buf->len - offset)) goto wrap_failed; *integ_len = cpu_to_be32(integ_buf.len); p = xdr_reserve_space(xdr, 0); if (!p) goto wrap_failed; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_mic; /* Check that the trailing MIC fit in the buffer, after the fact */ if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto wrap_failed; return 0; wrap_failed: return -EMSGSIZE; bad_mic: trace_rpcgss_get_mic(task, maj_stat); return -EIO; } static void priv_release_snd_buf(struct rpc_rqst *rqstp) { int i; for (i=0; i < rqstp->rq_enc_pages_num; i++) __free_page(rqstp->rq_enc_pages[i]); kfree(rqstp->rq_enc_pages); rqstp->rq_release_snd_buf = NULL; } static int alloc_enc_pages(struct rpc_rqst *rqstp) { struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; int first, last, i; if (rqstp->rq_release_snd_buf) rqstp->rq_release_snd_buf(rqstp); if (snd_buf->page_len == 0) { rqstp->rq_enc_pages_num = 0; return 0; } first = snd_buf->page_base >> PAGE_SHIFT; last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages = kmalloc_array(rqstp->rq_enc_pages_num, sizeof(struct page *), GFP_KERNEL); if (!rqstp->rq_enc_pages) goto out; for (i=0; i < rqstp->rq_enc_pages_num; i++) { rqstp->rq_enc_pages[i] = alloc_page(GFP_KERNEL); if (rqstp->rq_enc_pages[i] == NULL) goto out_free; } rqstp->rq_release_snd_buf = priv_release_snd_buf; return 0; out_free: rqstp->rq_enc_pages_num = i; priv_release_snd_buf(rqstp); out: return -EAGAIN; } static noinline_for_stack int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; u32 pad, offset, maj_stat; int status; __be32 *p, *opaque_len; struct page **inpages; int first; struct kvec *iov; status = -EIO; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; opaque_len = p++; *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; status = alloc_enc_pages(rqstp); if (unlikely(status)) goto wrap_failed; first = snd_buf->page_base >> PAGE_SHIFT; inpages = snd_buf->pages + first; snd_buf->pages = rqstp->rq_enc_pages; snd_buf->page_base -= first << PAGE_SHIFT; /* * Move the tail into its own page, in case gss_wrap needs * more space in the head when wrapping. * * Still... Why can't gss_wrap just slide the tail down? */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) { char *tmp; tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); snd_buf->tail[0].iov_base = tmp; } offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ if (unlikely(snd_buf->len > snd_buf->buflen)) { status = -EIO; goto wrap_failed; } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_wrap; *opaque_len = cpu_to_be32(snd_buf->len - offset); /* guess whether the pad goes into the head or the tail: */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) iov = snd_buf->tail; else iov = snd_buf->head; p = iov->iov_base + iov->iov_len; pad = xdr_pad_size(snd_buf->len - offset); memset(p, 0, pad); iov->iov_len += pad; snd_buf->len += pad; return 0; wrap_failed: return status; bad_wrap: trace_rpcgss_wrap(task, maj_stat); return -EIO; } static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status; status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) { /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. */ status = rpcauth_wrap_req_encode(task, xdr); goto out; } switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: status = rpcauth_wrap_req_encode(task, xdr); break; case RPC_GSS_SVC_INTEGRITY: status = gss_wrap_req_integ(cred, ctx, task, xdr); break; case RPC_GSS_SVC_PRIVACY: status = gss_wrap_req_priv(cred, ctx, task, xdr); break; default: status = -EIO; } out: gss_put_ctx(ctx); return status; } /** * gss_update_rslack - Possibly update RPC receive buffer size estimates * @task: rpc_task for incoming RPC Reply being unwrapped * @cred: controlling rpc_cred for @task * @before: XDR words needed before each RPC Reply message * @after: XDR words needed following each RPC Reply message * */ static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred, unsigned int before, unsigned int after) { struct rpc_auth *auth = cred->cr_auth; if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) { auth->au_ralign = auth->au_verfsize + before; auth->au_rslack = auth->au_verfsize + after; trace_rpcgss_update_slack(task, auth); } } static int gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred) { gss_update_rslack(task, cred, 0, 0); return 0; } /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf; u32 len, offset, seqno, maj_stat; struct xdr_netobj mic; int ret; ret = -EIO; mic.data = NULL; /* opaque databody_integ<>; */ if (xdr_stream_decode_u32(xdr, &len)) goto unwrap_failed; if (len & 3) goto unwrap_failed; offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; if (seqno != *rqstp->rq_seqnos) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; /* * The xdr_stream now points to the beginning of the * upper layer payload, to be passed below to * rpcauth_unwrap_resp_decode(). The checksum, which * follows the upper layer payload in @rcv_buf, is * located and parsed without updating the xdr_stream. */ /* opaque checksum<>; */ offset += len; if (xdr_decode_word(rcv_buf, offset, &len)) goto unwrap_failed; offset += sizeof(__be32); if (offset + len > rcv_buf->len) goto unwrap_failed; mic.len = len; mic.data = kmalloc(len, GFP_KERNEL); if (ZERO_OR_NULL_PTR(mic.data)) goto unwrap_failed; if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len)); ret = 0; out: kfree(mic.data); return ret; unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno); goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); goto out; } static noinline_for_stack int gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; struct kvec *head = rqstp->rq_rcv_buf.head; u32 offset, opaque_len, maj_stat; __be32 *p; p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (unlikely(!p)) goto unwrap_failed; opaque_len = be32_to_cpup(p++); offset = (u8 *)(p) - (u8 *)head->iov_base; if (offset + opaque_len > rcv_buf->len) goto unwrap_failed; maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, offset + opaque_len, rcv_buf); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ if (be32_to_cpup(p++) != *rqstp->rq_seqnos) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. * rcv_buf has changed, thus the stream needs to be reset. */ xdr_init_decode(xdr, rcv_buf, p, rqstp); gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align, 2 + ctx->gc_gss_ctx->slack); return 0; unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); return -EIO; } static bool gss_seq_is_newer(u32 new, u32 old) { return (s32)(new - old) > 0; } static bool gss_xmit_need_reencode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *cred = req->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); u32 win, seq_xmit = 0; bool ret = true; if (!ctx) goto out; if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) { u32 tmp = seq_xmit; seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos); if (seq_xmit == tmp) { ret = false; goto out_ctx; } } win = ctx->gc_win; if (win > 0) ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win); out_ctx: gss_put_ctx(ctx); out: trace_rpcgss_need_reencode(task, seq_xmit, ret); return ret; } static int gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct rpc_cred *cred = rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) goto out_decode; switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: status = gss_unwrap_resp_auth(task, cred); break; case RPC_GSS_SVC_INTEGRITY: status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr); break; case RPC_GSS_SVC_PRIVACY: status = gss_unwrap_resp_priv(task, cred, ctx, rqstp, xdr); break; } if (status) goto out; out_decode: status = rpcauth_unwrap_resp_decode(task, xdr); out: gss_put_ctx(ctx); return status; } static const struct rpc_authops authgss_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_GSS, .au_name = "RPCSEC_GSS", .create = gss_create, .destroy = gss_destroy, .hash_cred = gss_hash_cred, .lookup_cred = gss_lookup_cred, .crcreate = gss_create_cred, .info2flavor = gss_mech_info2flavor, .flavor2info = gss_mech_flavor2info, }; static const struct rpc_credops gss_credops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_cred, .cr_init = gss_cred_init, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh, .crvalidate = gss_validate, .crwrap_req = gss_wrap_req, .crunwrap_resp = gss_unwrap_resp, .crkey_timeout = gss_key_timeout, .crstringify_acceptor = gss_stringify_acceptor, .crneed_reencode = gss_xmit_need_reencode, }; static const struct rpc_credops gss_nullops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_nullcred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh_null, .crvalidate = gss_validate, .crwrap_req = gss_wrap_req, .crunwrap_resp = gss_unwrap_resp, .crstringify_acceptor = gss_stringify_acceptor, }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { .upcall = gss_v0_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v0, .release_pipe = gss_pipe_release, }; static const struct rpc_pipe_ops gss_upcall_ops_v1 = { .upcall = gss_v1_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v1, .release_pipe = gss_pipe_release, }; static __net_init int rpcsec_gss_init_net(struct net *net) { return gss_svc_init_net(net); } static __net_exit void rpcsec_gss_exit_net(struct net *net) { gss_svc_shutdown_net(net); } static struct pernet_operations rpcsec_gss_net_ops = { .init = rpcsec_gss_init_net, .exit = rpcsec_gss_exit_net, }; /* * Initialize RPCSEC_GSS module */ static int __init init_rpcsec_gss(void) { int err = 0; err = rpcauth_register(&authgss_ops); if (err) goto out; err = gss_svc_init(); if (err) goto out_unregister; err = register_pernet_subsys(&rpcsec_gss_net_ops); if (err) goto out_svc_exit; rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version"); return 0; out_svc_exit: gss_svc_shutdown(); out_unregister: rpcauth_unregister(&authgss_ops); out: return err; } static void __exit exit_rpcsec_gss(void) { unregister_pernet_subsys(&rpcsec_gss_net_ops); gss_svc_shutdown(); rpcauth_unregister(&authgss_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_ALIAS("rpc-auth-6"); MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, uint, 0644); MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until " "the RPC engine retries an expired credential"); module_param_named(key_expire_timeo, gss_key_expire_timeo, uint, 0644); MODULE_PARM_DESC(key_expire_timeo, "Time (in seconds) at the end of a " "credential keys lifetime where the NFS layer cleans up " "prior to key expiration"); module_init(init_rpcsec_gss) module_exit(exit_rpcsec_gss)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2009 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> */ #ifndef _LINUX_INTEGRITY_H #define _LINUX_INTEGRITY_H #include <linux/fs.h> #include <linux/iversion.h> enum integrity_status { INTEGRITY_PASS = 0, INTEGRITY_PASS_IMMUTABLE, INTEGRITY_FAIL, INTEGRITY_FAIL_IMMUTABLE, INTEGRITY_NOLABEL, INTEGRITY_NOXATTRS, INTEGRITY_UNKNOWN, }; #ifdef CONFIG_INTEGRITY extern void __init integrity_load_keys(void); #else static inline void integrity_load_keys(void) { } #endif /* CONFIG_INTEGRITY */ /* An inode's attributes for detection of changes */ struct integrity_inode_attributes { u64 version; /* track inode changes */ unsigned long ino; dev_t dev; }; /* * On stacked filesystems the i_version alone is not enough to detect file data * or metadata change. Additional metadata is required. */ static inline void integrity_inode_attrs_store(struct integrity_inode_attributes *attrs, u64 i_version, const struct inode *inode) { attrs->version = i_version; attrs->dev = inode->i_sb->s_dev; attrs->ino = inode->i_ino; } /* * On stacked filesystems detect whether the inode or its content has changed. */ static inline bool integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs, const struct inode *inode) { return (inode->i_sb->s_dev != attrs->dev || inode->i_ino != attrs->ino || !inode_eq_iversion(inode, attrs->version)); } #endif /* _LINUX_INTEGRITY_H */
11 1 1 2 3 4 4 4 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 // SPDX-License-Identifier: GPL-2.0-or-later /* * mpls tunnels An implementation mpls tunnels using the light weight tunnel * infrastructure * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/module.h> #include <linux/mpls.h> #include <linux/vmalloc.h> #include <net/ip.h> #include <net/dst.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/netns/generic.h> #include <net/ip6_fib.h> #include <net/route.h> #include <net/mpls_iptunnel.h> #include <linux/mpls_iptunnel.h> #include "internal.h" static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { [MPLS_IPTUNNEL_DST] = { .len = sizeof(u32) }, [MPLS_IPTUNNEL_TTL] = { .type = NLA_U8 }, }; static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) { /* The size of the layer 2.5 labels to be added for this route */ return en->labels * sizeof(struct mpls_shim_hdr); } static int mpls_xmit(struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; struct net_device *out_dev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; struct mpls_dev *out_mdev; struct net *net; int err = 0; bool bos; int i; unsigned int ttl; /* Find the output device */ out_dev = dst->dev; net = dev_net(out_dev); if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; skb_forward_csum(skb); tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); /* Obtain the ttl using the following set of rules. * * LWT ttl propagation setting: * - disabled => use default TTL value from LWT * - enabled => use TTL value from IPv4/IPv6 header * - default => * Global ttl propagation setting: * - disabled => use default TTL value from global setting * - enabled => use TTL value from IPv4/IPv6 header */ if (dst->ops->family == AF_INET) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && !net->mpls.ip_ttl_propagate) ttl = net->mpls.default_ttl; else ttl = ip_hdr(skb)->ttl; rt = dst_rtable(dst); } else if (dst->ops->family == AF_INET6) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && !net->mpls.ip_ttl_propagate) ttl = net->mpls.default_ttl; else ttl = ipv6_hdr(skb)->hop_limit; rt6 = dst_rt6_info(dst); } else { goto drop; } /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) goto drop; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) hh_len = 0; /* Ensure there is enough space for the headers in the skb */ if (skb_cow_head(skb, hh_len + new_header_size)) goto drop; skb_set_inner_protocol(skb, skb->protocol); skb_reset_inner_network_header(skb); skb_push(skb, new_header_size); skb_reset_network_header(skb); skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); /* Push the new labels */ hdr = mpls_hdr(skb); bos = true; for (i = tun_encap_info->labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(tun_encap_info->label[i], ttl, 0, bos); bos = false; } mpls_stats_inc_outucastpkts(out_dev, skb); if (rt) { if (rt->rt_gw_family == AF_INET6) err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, skb); else err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, skb); } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3], skb); } else err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, skb); } if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); return LWTUNNEL_XMIT_DONE; drop: out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); return -EINVAL; } static int mpls_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct mpls_iptunnel_encap *tun_encap_info; struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; struct lwtunnel_state *newts; u8 n_labels; int ret; ret = nla_parse_nested_deprecated(tb, MPLS_IPTUNNEL_MAX, nla, mpls_iptunnel_policy, extack); if (ret < 0) return ret; if (!tb[MPLS_IPTUNNEL_DST]) { NL_SET_ERR_MSG(extack, "MPLS_IPTUNNEL_DST attribute is missing"); return -EINVAL; } /* determine number of labels */ if (nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, &n_labels, NULL, extack)) return -EINVAL; newts = lwtunnel_state_alloc(struct_size(tun_encap_info, label, n_labels)); if (!newts) return -ENOMEM; tun_encap_info = mpls_lwtunnel_encap(newts); ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels, &tun_encap_info->labels, tun_encap_info->label, extack); if (ret) goto errout; tun_encap_info->ttl_propagate = MPLS_TTL_PROP_DEFAULT; if (tb[MPLS_IPTUNNEL_TTL]) { tun_encap_info->default_ttl = nla_get_u8(tb[MPLS_IPTUNNEL_TTL]); /* TTL 0 implies propagate from IP header */ tun_encap_info->ttl_propagate = tun_encap_info->default_ttl ? MPLS_TTL_PROP_DISABLED : MPLS_TTL_PROP_ENABLED; } newts->type = LWTUNNEL_ENCAP_MPLS; newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; newts->headroom = mpls_encap_size(tun_encap_info); *ts = newts; return 0; errout: kfree(newts); *ts = NULL; return ret; } static int mpls_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; tun_encap_info = mpls_lwtunnel_encap(lwtstate); if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, tun_encap_info->label)) goto nla_put_failure; if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT && nla_put_u8(skb, MPLS_IPTUNNEL_TTL, tun_encap_info->default_ttl)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; int nlsize; tun_encap_info = mpls_lwtunnel_encap(lwtstate); nlsize = nla_total_size(tun_encap_info->labels * 4); if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT) nlsize += nla_total_size(1); return nlsize; } static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); int l; if (a_hdr->labels != b_hdr->labels || a_hdr->ttl_propagate != b_hdr->ttl_propagate || a_hdr->default_ttl != b_hdr->default_ttl) return 1; for (l = 0; l < a_hdr->labels; l++) if (a_hdr->label[l] != b_hdr->label[l]) return 1; return 0; } static const struct lwtunnel_encap_ops mpls_iptun_ops = { .build_state = mpls_build_state, .xmit = mpls_xmit, .fill_encap = mpls_fill_encap_info, .get_encap_size = mpls_encap_nlsize, .cmp_encap = mpls_encap_cmp, .owner = THIS_MODULE, }; static int __init mpls_iptunnel_init(void) { return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); } module_init(mpls_iptunnel_init); static void __exit mpls_iptunnel_exit(void) { lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); } module_exit(mpls_iptunnel_exit); MODULE_ALIAS_RTNL_LWT(MPLS); MODULE_SOFTDEP("post: mpls_gso"); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2");
4 4 1 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Manage send buffer. * Producer: * Copy user space data into send buffer, if send buffer space available. * Consumer: * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available. * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/tcp.h> #include "smc.h" #include "smc_wr.h" #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" #include "smc_tx.h" #include "smc_stats.h" #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() * to wakeup sndbuf producers that blocked with smc_tx_wait(). * called under sk_socket lock. */ static void smc_tx_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; struct smc_sock *smc = smc_sk(sk); struct socket_wq *wq; /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { if (test_bit(SOCK_NOSPACE, &sock->flags)) SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } /* Wakeup sndbuf producers that blocked with smc_tx_wait(). * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). */ void smc_tx_sndbuf_nonfull(struct smc_sock *smc) { if (smc->sk.sk_socket && test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags)) smc->sk.sk_write_space(&smc->sk); } /* blocks sndbuf producer until at least one byte of free space available * or urgent Byte was consumed */ static int smc_tx_wait(struct smc_sock *smc, int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; long timeo; int rc = 0; /* similar to sk_stream_wait_memory */ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; } if (smc_cdc_rxed_any_close(conn)) { rc = -ECONNRESET; break; } if (!timeo) { /* ensure EPOLLOUT is subsequently generated */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); rc = -EAGAIN; break; } if (signal_pending(current)) { rc = sock_intr_errno(timeo); break; } sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend) break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, READ_ONCE(sk->sk_err) || (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend), &wait); } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool smc_tx_is_corked(struct smc_sock *smc) { struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } /* If we have pending CDC messages, do not send: * Because CQE of this CDC message will happen shortly, it gives * a chance to coalesce future sendmsg() payload in to one RDMA Write, * without need for a timer, and with no latency trade off. * Algorithm here: * 1. First message should never cork * 2. If we have pending Tx CDC messages, wait for the first CDC * message's completion * 3. Don't cork to much data in a single RDMA Write to prevent burst * traffic, total corked message should not exceed sendbuf/2 */ static bool smc_should_autocork(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; int corking_size; corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1, sock_net(&smc->sk)->smc.sysctl_autocorking_size); if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || smc_tx_prepared_sends(conn) > corking_size) return false; return true; } static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) { struct smc_connection *conn = &smc->conn; if (smc_should_autocork(smc)) return true; /* for a corked socket defer the RDMA writes if * sndbuf_space is still available. The applications * should known how/when to uncork it. */ if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && atomic_read(&conn->sndbuf_space)) return true; return false; } /* sndbuf producer: main API called by socket layer. * called under sock lock. */ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) { size_t copylen, send_done = 0, send_remaining = len; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; union smc_host_cursor prep; struct sock *sk = &smc->sk; char *sndbuf_base; int tx_cnt_prep; int writespace; int rc, chunk; /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { rc = -EPIPE; goto out_err; } if (sk->sk_state == SMC_INIT) return -ENOTCONN; if (len > conn->sndbuf_desc->len) SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); if (len > conn->peer_rmbe_size) SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); if (msg->msg_flags & MSG_OOB) SMC_STAT_INC(smc, urg_data_cnt); while (msg_data_left(msg)) { if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { if (send_done) return send_done; rc = smc_tx_wait(smc, msg->msg_flags); if (rc) goto out_err; continue; } /* initialize variables for 1st iteration of subsequent loop */ /* could be just 1 byte, even after smc_tx_wait above */ writespace = atomic_read(&conn->sndbuf_space); /* not more than what user space asked for */ copylen = min_t(size_t, send_remaining, writespace); /* determine start of sndbuf */ sndbuf_base = conn->sndbuf_desc->cpu_addr; smc_curs_copy(&prep, &conn->tx_curs_prep, conn); tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len - tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; for (chunk = 0; chunk < 2; chunk++) { rc = memcpy_from_msg(sndbuf_base + chunk_off, msg, chunk_len); if (rc) { smc_sndbuf_sync_sg_for_device(conn); if (send_done) return send_done; goto out_err; } send_done += chunk_len; send_remaining -= chunk_len; if (chunk_len_sum == copylen) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ chunk_len = copylen - chunk_len; /* remainder */ chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in send ring buffer */ } smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); smc_curs_copy(&conn->tx_curs_prep, &prep, conn); /* increased in send tasklet smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); /* since we just produced more new data into sndbuf, * trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; /* If we need to cork, do nothing and wait for the next * sendmsg() call or push on tx completion */ if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ return send_done; out_err: rc = sk_stream_error(sk, msg->msg_flags, rc); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(rc == -EAGAIN)) sk->sk_write_space(sk); return rc; } /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal) { int rc; rc = smc_ism_write(conn->lgr->smcd, conn->peer_token, conn->peer_rmbe_idx, signal, conn->tx_off + offset, data, len); if (rc) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return rc; } /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; struct smc_link *link = conn->lnk; int rc; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); return rc; } /* sndbuf consumer */ static inline void smc_tx_advance_cursors(struct smc_connection *conn, union smc_host_cursor *prod, union smc_host_cursor *sent, size_t len) { smc_curs_add(conn->peer_rmbe_size, prod, len); /* increased in recv tasklet smc_cdc_msg_rcv() */ smp_mb__before_atomic(); /* data in flight reduces usable snd_wnd */ atomic_sub(len, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); smc_curs_add(conn->sndbuf_desc->len, sent, len); } /* SMC-R helper for smc_tx_rdma_writes() */ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t src_off, size_t src_len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { struct smc_link *link = conn->lnk; dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; int num_sges; int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk]; struct ib_sge *sge = wr->wr.sg_list; u64 base_addr = dma_addr; if (dst_len < link->qp_attr.cap.max_inline_data) { base_addr = virt_addr; wr->wr.send_flags |= IB_SEND_INLINE; } else { wr->wr.send_flags &= ~IB_SEND_INLINE; } num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? (virt_addr + src_off) : (base_addr + src_off); sge[srcchunk].length = src_len; if (conn->sndbuf_desc->is_vm) sge[srcchunk].lkey = conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++; src_off += src_len; if (src_off >= conn->sndbuf_desc->len) src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr); if (rc) return rc; if (dst_len_sum == len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ dst_off = 0; /* modulo offset in RMBE ring buffer */ dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, dst_len, conn->sndbuf_desc->len - sent_count); src_len_sum = src_len; } return 0; } /* SMC-D helper for smc_tx_rdma_writes() */ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t src_off, size_t src_len, size_t dst_off, size_t dst_len) { int src_len_sum = src_len, dst_len_sum = dst_len; int srcchunk, dstchunk; int rc; if (conn->sndbuf_desc->is_attached) return 0; for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) { void *data = conn->sndbuf_desc->cpu_addr + src_off; rc = smcd_tx_ism_write(conn, data, src_len, dst_off + sizeof(struct smcd_cdc_msg), 0); if (rc) return rc; dst_off += src_len; src_off += src_len; if (src_off >= conn->sndbuf_desc->len) src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } if (dst_len_sum == len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ dst_off = 0; /* modulo offset in RMBE ring buffer */ dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off); src_len_sum = src_len; } return 0; } /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; * usable snd_wnd as max transmit */ static int smc_tx_rdma_writes(struct smc_connection *conn, struct smc_rdma_wr *wr_rdma_buf) { size_t len, src_len, dst_off, dst_len; /* current chunk values */ union smc_host_cursor sent, prep, prod, cons; struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; int rc; /* source: sndbuf */ smc_curs_copy(&sent, &conn->tx_curs_sent, conn); smc_curs_copy(&prep, &conn->tx_curs_prep, conn); /* cf. wmem_alloc - (snd_max - snd_una) */ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) return 0; /* destination: RMBE */ /* cf. snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); if (rmbespace <= 0) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); return 0; } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* if usable snd_wnd closes ask peer to advertise once it opens again */ pflags = &conn->local_tx_ctrl.prod_flags; pflags->write_blocked = (to_send >= rmbespace); /* cf. usable snd_wnd */ len = min(to_send, rmbespace); /* initialize variables for first iteration of subsequent nested loop */ dst_off = prod.count; if (prod.wrap == cons.wrap) { /* the filled destination area is unwrapped, * hence the available free destination space is wrapped * and we need 2 destination chunks of sum len; start with 1st * which is limited by what's available in sndbuf */ dst_len = min_t(size_t, conn->peer_rmbe_size - prod.count, len); } else { /* the filled destination area is wrapped, * hence the available free destination space is unwrapped * and we need a single destination chunk of entire len */ dst_len = len; } /* dst_len determines the maximum src_len */ if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ src_len = dst_len; } else { /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ src_len = conn->sndbuf_desc->len - sent.count; } if (conn->lgr->is_smcd) rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len, dst_off, dst_len); else rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, dst_off, dst_len, wr_rdma_buf); if (rc) return rc; if (conn->urg_tx_pend && len == to_send) pflags->urg_data_present = 1; smc_tx_advance_cursors(conn, &prod, &sent, len); /* update connection's cursors with advanced local cursors */ smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn); /* dst: peer RMBE */ smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */ return 0; } /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; if (!link || !smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); if (conn->killed) return -EPIPE; rc = 0; mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); } return rc; } spin_lock_bh(&conn->send_lock); if (link != conn->lnk) { /* link of connection changed, tx_work will restart */ smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); rc = -ENOLINK; goto out_unlock; } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } } rc = smc_cdc_msg_send(conn, wr_buf, pend); if (!rc && pflags->urg_data_present) { pflags->urg_data_pending = 0; pflags->urg_data_present = 0; } out_unlock: spin_unlock_bh(&conn->send_lock); smc_wr_tx_link_put(link); return rc; } static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; int rc = 0; spin_lock_bh(&conn->send_lock); if (!pflags->urg_data_present) rc = smc_tx_rdma_writes(conn, NULL); if (!rc) rc = smcd_cdc_msg_send(conn); if (!rc && pflags->urg_data_present) { pflags->urg_data_pending = 0; pflags->urg_data_present = 0; } spin_unlock_bh(&conn->send_lock); return rc; } int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc = 0; /* No data in the send queue */ if (unlikely(smc_tx_prepared_sends(conn) <= 0)) goto out; /* Peer don't have RMBE space */ if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); goto out; } if (conn->killed || conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { rc = -EPIPE; /* connection being aborted */ goto out; } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else rc = smcr_tx_sndbuf_nonempty(conn); if (!rc) { /* trigger socket release if connection is closing */ smc_close_wake_tx_prepared(smc); } out: return rc; } /* Wakeup sndbuf consumers from process context * since there is more data to transmit. The caller * must hold sock lock. */ void smc_tx_pending(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc; if (smc->sk.sk_err) return; rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; } /* Wakeup sndbuf consumers from process context * since there is more data to transmit in locked * sock. */ void smc_tx_work(struct work_struct *work) { struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); lock_sock(&smc->sk); smc_tx_pending(conn); release_sock(&smc->sk); } void smc_tx_consumer_update(struct smc_connection *conn, bool force) { union smc_host_cursor cfed, cons, prod; int sender_free = conn->rmb_desc->len; int to_confirm; smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn); to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); if (to_confirm > conn->rmbe_update_limit) { smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn); sender_free = conn->rmb_desc->len - smc_curs_diff_large(conn->rmb_desc->len, &cfed, &prod); } if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || force || ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if (conn->killed || conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && !conn->killed) { queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); return; } } if (conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; } /***************************** send initialize *******************************/ /* Initialize send properties on connection establishment. NB: not __init! */ void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; }
147 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH * Copyright(c) 2020-2024 Intel Corporation */ #ifndef STA_INFO_H #define STA_INFO_H #include <linux/list.h> #include <linux/types.h> #include <linux/if_ether.h> #include <linux/workqueue.h> #include <linux/average.h> #include <linux/bitfield.h> #include <linux/etherdevice.h> #include <linux/rhashtable.h> #include <linux/u64_stats_sync.h> #include "key.h" /** * enum ieee80211_sta_info_flags - Stations flags * * These flags are used with &struct sta_info's @flags member, but * only indirectly with set_sta_flag() and friends. * * @WLAN_STA_AUTH: Station is authenticated. * @WLAN_STA_ASSOC: Station is associated. * @WLAN_STA_PS_STA: Station is in power-save mode * @WLAN_STA_AUTHORIZED: Station is authorized to send/receive traffic. * This bit is always checked so needs to be enabled for all stations * when virtual port control is not in use. * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble * frames. * @WLAN_STA_WDS: Station is one of our WDS peers. * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next * frame to this station is transmitted. * @WLAN_STA_MFP: Management frame protection is used with this STA. * @WLAN_STA_BLOCK_BA: Used to deny ADDBA requests (both TX and RX) * during suspend/resume and station removal. * @WLAN_STA_PS_DRIVER: driver requires keeping this station in * power-save mode logically to flush frames that might still * be in the queues * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping * station in power-save mode, reply when the driver unblocks. * @WLAN_STA_TDLS_PEER: Station is a TDLS peer. * @WLAN_STA_TDLS_PEER_AUTH: This TDLS peer is authorized to send direct * packets. This means the link is enabled. * @WLAN_STA_TDLS_INITIATOR: We are the initiator of the TDLS link with this * station. * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this * TDLS peer * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on * the BSS base channel. * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was * keeping station in power-save mode, reply when the driver * unblocks the station. * @WLAN_STA_SP: Station is in a service period, so don't try to * reply to other uAPSD trigger frames or PS-Poll. * @WLAN_STA_4ADDR_EVENT: 4-addr event was already sent for this frame. * @WLAN_STA_INSERTED: This station is inserted into the hash table. * @WLAN_STA_RATE_CONTROL: rate control was initialized for this station. * @WLAN_STA_TOFFSET_KNOWN: toffset calculated for this station is valid. * @WLAN_STA_MPSP_OWNER: local STA is owner of a mesh Peer Service Period. * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX * until pending frames are delivered * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, * so drop all packets without a key later. * @WLAN_STA_DECAP_OFFLOAD: This station uses rx decap offload * * @NUM_WLAN_STA_FLAGS: number of defined flags */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH, WLAN_STA_ASSOC, WLAN_STA_PS_STA, WLAN_STA_AUTHORIZED, WLAN_STA_SHORT_PREAMBLE, WLAN_STA_WDS, WLAN_STA_CLEAR_PS_FILT, WLAN_STA_MFP, WLAN_STA_BLOCK_BA, WLAN_STA_PS_DRIVER, WLAN_STA_PSPOLL, WLAN_STA_TDLS_PEER, WLAN_STA_TDLS_PEER_AUTH, WLAN_STA_TDLS_INITIATOR, WLAN_STA_TDLS_CHAN_SWITCH, WLAN_STA_TDLS_OFF_CHANNEL, WLAN_STA_TDLS_WIDER_BW, WLAN_STA_UAPSD, WLAN_STA_SP, WLAN_STA_4ADDR_EVENT, WLAN_STA_INSERTED, WLAN_STA_RATE_CONTROL, WLAN_STA_TOFFSET_KNOWN, WLAN_STA_MPSP_OWNER, WLAN_STA_MPSP_RECIPIENT, WLAN_STA_PS_DELIVER, WLAN_STA_USES_ENCRYPTION, WLAN_STA_DECAP_OFFLOAD, NUM_WLAN_STA_FLAGS, }; #define ADDBA_RESP_INTERVAL HZ #define HT_AGG_MAX_RETRIES 15 #define HT_AGG_BURST_RETRIES 3 #define HT_AGG_RETRIES_PERIOD (15 * HZ) #define HT_AGG_STATE_DRV_READY 0 #define HT_AGG_STATE_RESPONSE_RECEIVED 1 #define HT_AGG_STATE_OPERATIONAL 2 #define HT_AGG_STATE_STOPPING 3 #define HT_AGG_STATE_WANT_START 4 #define HT_AGG_STATE_WANT_STOP 5 #define HT_AGG_STATE_START_CB 6 #define HT_AGG_STATE_STOP_CB 7 #define HT_AGG_STATE_SENT_ADDBA 8 DECLARE_EWMA(avg_signal, 10, 8) enum ieee80211_agg_stop_reason { AGG_STOP_DECLINED, AGG_STOP_LOCAL_REQUEST, AGG_STOP_PEER_REQUEST, AGG_STOP_DESTROY_STA, }; /* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */ #define AIRTIME_USE_TX BIT(0) #define AIRTIME_USE_RX BIT(1) struct airtime_info { u64 rx_airtime; u64 tx_airtime; unsigned long last_active; s32 deficit; atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ u32 aql_limit_low; u32 aql_limit_high; }; void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed); struct sta_info; /** * struct tid_ampdu_tx - TID aggregation information (Tx). * * @rcu_head: rcu head for freeing structure * @session_timer: check if we keep Tx-ing on the TID (by timeout value) * @addba_resp_timer: timer for peer's response to addba request * @pending: pending frames queue -- use sta's spinlock to protect * @sta: station we are attached to * @dialog_token: dialog token for aggregation session * @timeout: session timeout value to be filled in ADDBA requests * @tid: TID number * @state: session state (see above) * @last_tx: jiffies of last tx activity * @stop_initiator: initiator of a session stop * @tx_stop: TX DelBA frame when stopping * @buf_size: reorder buffer size at receiver * @failed_bar_ssn: ssn of the last failed BAR tx attempt * @bar_pending: BAR needs to be re-sent * @amsdu: support A-MSDU within A-MDPU * @ssn: starting sequence number of the session * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. * * The TX path can access it under RCU lock-free if, and * only if, the state has the flag %HT_AGG_STATE_OPERATIONAL * set. Otherwise, the TX path must also acquire the spinlock * and re-check the state, see comments in the tx code * touching it. */ struct tid_ampdu_tx { struct rcu_head rcu_head; struct timer_list session_timer; struct timer_list addba_resp_timer; struct sk_buff_head pending; struct sta_info *sta; unsigned long state; unsigned long last_tx; u16 timeout; u8 dialog_token; u8 stop_initiator; bool tx_stop; u16 buf_size; u16 ssn; u16 failed_bar_ssn; bool bar_pending; bool amsdu; u8 tid; }; /** * struct tid_ampdu_rx - TID aggregation information (Rx). * * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an * A-MSDU with individually reported subframes. * @reorder_buf_filtered: bitmap indicating where there are filtered frames in * the reorder buffer that should be ignored when releasing frames * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) * @reorder_timer: releases expired frames from the reorder buffer. * @sta: station we are attached to * @last_rx: jiffies of last rx activity * @head_seq_num: head sequence number in reordering buffer. * @stored_mpdu_num: number of MPDUs in reordering buffer * @ssn: Starting Sequence Number expected to be aggregated. * @buf_size: buffer size for incoming A-MPDUs * @timeout: reset timer value (in TUs). * @tid: TID number * @rcu_head: RCU head used for freeing this struct * @reorder_lock: serializes access to reorder buffer, see below. * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and * and ssn. * @removed: this session is removed (but might have been found due to RCU) * @started: this session has started (head ssn or higher was received) * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. * * The @reorder_lock is used to protect the members of this * struct, except for @timeout, @buf_size and @dialog_token, * which are constant across the lifetime of the struct (the * dialog token being used only for debugging). */ struct tid_ampdu_rx { struct rcu_head rcu_head; spinlock_t reorder_lock; u64 reorder_buf_filtered; struct sk_buff_head *reorder_buf; unsigned long *reorder_time; struct sta_info *sta; struct timer_list session_timer; struct timer_list reorder_timer; unsigned long last_rx; u16 head_seq_num; u16 stored_mpdu_num; u16 ssn; u16 buf_size; u16 timeout; u8 tid; u8 auto_seq:1, removed:1, started:1; }; /** * struct sta_ampdu_mlme - STA aggregation information. * * @tid_rx: aggregation info for Rx per TID -- RCU protected * @tid_rx_token: dialog tokens for valid aggregation sessions * @tid_rx_timer_expired: bitmap indicating on which TIDs the * RX timer expired until the work for it runs * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the * driver requested to close until the work for it runs * @tid_rx_manage_offl: bitmap indicating which BA sessions were requested * to be treated as started/stopped due to offloading * @agg_session_valid: bitmap indicating which TID has a rx BA session open on * @unexpected_agg: bitmap indicating which TID already sent a delBA due to * unexpected aggregation related frames outside a session * @work: work struct for starting/stopping aggregation * @tid_tx: aggregation info for Tx per TID * @tid_start_tx: sessions where start was requested, not just protected * by wiphy mutex but also sta->lock * @last_addba_req_time: timestamp of the last addBA request. * @addba_req_num: number of times addBA request has been sent. * @dialog_token_allocator: dialog token enumerator for each new session; */ struct sta_ampdu_mlme { /* rx */ struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS]; u8 tid_rx_token[IEEE80211_NUM_TIDS]; unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_manage_offl[BITS_TO_LONGS(2 * IEEE80211_NUM_TIDS)]; unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; /* tx */ struct wiphy_work work; struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS]; struct tid_ampdu_tx *tid_start_tx[IEEE80211_NUM_TIDS]; unsigned long last_addba_req_time[IEEE80211_NUM_TIDS]; u8 addba_req_num[IEEE80211_NUM_TIDS]; u8 dialog_token_allocator; }; /* Value to indicate no TID reservation */ #define IEEE80211_TID_UNRESERVED 0xff #define IEEE80211_FAST_XMIT_MAX_IV 18 /** * struct ieee80211_fast_tx - TX fastpath information * @key: key to use for hw crypto * @hdr: the 802.11 header to put with the frame * @hdr_len: actual 802.11 header length * @sa_offs: offset of the SA * @da_offs: offset of the DA * @pn_offs: offset where to put PN for crypto (or 0 if not needed) * @band: band this will be transmitted on, for tx_info * @rcu_head: RCU head to free this struct * * This struct is small enough so that the common case (maximum crypto * header length of 8 like for CCMP/GCMP) fits into a single 64-byte * cache line. */ struct ieee80211_fast_tx { struct ieee80211_key *key; u8 hdr_len; u8 sa_offs, da_offs, pn_offs; u8 band; u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV + sizeof(rfc1042_header)] __aligned(2); struct rcu_head rcu_head; }; /** * struct ieee80211_fast_rx - RX fastpath information * @dev: netdevice for reporting the SKB * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type) * @vif_addr: interface address * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache) * @control_port_protocol: control port protocol copied from sdata * @expected_ds_bits: from/to DS bits expected * @icv_len: length of the MIC if present * @key: bool indicating encryption is expected (key is set) * @internal_forward: forward froms internally on AP/VLAN type interfaces * @uses_rss: copy of USES_RSS hw flag * @da_offs: offset of the DA in the header (for header conversion) * @sa_offs: offset of the SA in the header (for header conversion) * @rcu_head: RCU head for freeing this structure */ struct ieee80211_fast_rx { struct net_device *dev; enum nl80211_iftype vif_type; u8 vif_addr[ETH_ALEN] __aligned(2); u8 rfc1042_hdr[6] __aligned(2); __be16 control_port_protocol; __le16 expected_ds_bits; u8 icv_len; u8 key:1, internal_forward:1, uses_rss:1; u8 da_offs, sa_offs; struct rcu_head rcu_head; }; /* we use only values in the range 0-100, so pick a large precision */ DECLARE_EWMA(mesh_fail_avg, 20, 8) DECLARE_EWMA(mesh_tx_rate_avg, 8, 16) /** * struct mesh_sta - mesh STA information * @plink_lock: serialize access to plink fields * @llid: Local link ID * @plid: Peer link ID * @aid: local aid supplied by peer * @reason: Cancel reason on PLINK_HOLDING state * @plink_retries: Retries in establishment * @plink_state: peer link state * @plink_timeout: timeout of peer link * @plink_timer: peer link watch timer * @plink_sta: peer link watch timer's sta_info * @t_offset: timing offset relative to this host * @t_offset_setpoint: reference timing offset of this sta to be used when * calculating clockdrift * @local_pm: local link-specific power save mode * @peer_pm: peer-specific power save mode towards local STA * @nonpeer_pm: STA power save mode towards non-peer neighbors * @processed_beacon: set to true after peer rates and capabilities are * processed * @connected_to_gate: true if mesh STA has a path to a mesh gate * @connected_to_as: true if mesh STA has a path to a authentication server * @fail_avg: moving percentage of failed MSDUs * @tx_rate_avg: moving average of tx bitrate */ struct mesh_sta { struct timer_list plink_timer; struct sta_info *plink_sta; s64 t_offset; s64 t_offset_setpoint; spinlock_t plink_lock; u16 llid; u16 plid; u16 aid; u16 reason; u8 plink_retries; bool processed_beacon; bool connected_to_gate; bool connected_to_as; enum nl80211_plink_state plink_state; u32 plink_timeout; /* mesh power save */ enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; /* moving percentage of failed MSDUs */ struct ewma_mesh_fail_avg fail_avg; /* moving average of tx bitrate */ struct ewma_mesh_tx_rate_avg tx_rate_avg; }; DECLARE_EWMA(signal, 10, 8) struct ieee80211_sta_rx_stats { unsigned long packets; unsigned long last_rx; unsigned long num_duplicates; unsigned long fragments; unsigned long dropped; int last_signal; u8 chains; s8 chain_signal_last[IEEE80211_MAX_CHAINS]; u32 last_rate; struct u64_stats_sync syncp; u64 bytes; u64 msdu[IEEE80211_NUM_TIDS + 1]; }; /* * IEEE 802.11-2016 (10.6 "Defragmentation") recommends support for "concurrent * reception of at least one MSDU per access category per associated STA" * on APs, or "at least one MSDU per access category" on other interface types. * * This limit can be increased by changing this define, at the cost of slower * frame reassembly and increased memory use while fragments are pending. */ #define IEEE80211_FRAGMENT_MAX 4 struct ieee80211_fragment_entry { struct sk_buff_head skb_list; unsigned long first_frag_time; u16 seq; u16 extra_len; u16 last_frag; u8 rx_queue; u8 check_sequential_pn:1, /* needed for CCMP/GCMP */ is_protected:1; u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ unsigned int key_color; }; struct ieee80211_fragment_cache { struct ieee80211_fragment_entry entries[IEEE80211_FRAGMENT_MAX]; unsigned int next; }; /** * struct link_sta_info - Link STA information * All link specific sta info are stored here for reference. This can be * a single entry for non-MLD STA or multiple entries for MLD STA * @addr: Link MAC address - Can be same as MLD STA mac address and is always * same for non-MLD STA. This is used as key for searching link STA * @link_id: Link ID uniquely identifying the link STA. This is 0 for non-MLD * and set to the corresponding vif LinkId for MLD STA * @op_mode_nss: NSS limit as set by operating mode notification, or 0 * @capa_nss: NSS limit as determined by local and peer capabilities * @link_hash_node: hash node for rhashtable * @sta: Points to the STA info * @gtk: group keys negotiated with this station, if any * @tx_stats: TX statistics * @tx_stats.packets: # of packets transmitted * @tx_stats.bytes: # of bytes in all packets transmitted * @tx_stats.last_rate: last TX rate * @tx_stats.msdu: # of transmitted MSDUs per TID * @rx_stats: RX statistics * @rx_stats_avg: averaged RX statistics * @rx_stats_avg.signal: averaged signal * @rx_stats_avg.chain_signal: averaged per-chain signal * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs * this (by advertising the USES_RSS hw flag) * @status_stats: TX status statistics * @status_stats.filtered: # of filtered frames * @status_stats.retry_failed: # of frames that failed after retry * @status_stats.retry_count: # of retries attempted * @status_stats.lost_packets: # of lost packets * @status_stats.last_pkt_time: timestamp of last ACKed packet * @status_stats.msdu_retries: # of MSDU retries * @status_stats.msdu_failed: # of failed MSDUs * @status_stats.last_ack: last ack timestamp (jiffies) * @status_stats.last_ack_signal: last ACK signal * @status_stats.ack_signal_filled: last ACK signal validity * @status_stats.avg_ack_signal: average ACK signal * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, * taken from HT/VHT capabilities or VHT operating mode notification * @rx_omi_bw_rx: RX OMI bandwidth restriction to apply for RX * @rx_omi_bw_tx: RX OMI bandwidth restriction to apply for TX * @rx_omi_bw_staging: RX OMI bandwidth restriction to apply later * during finalize * @debugfs_dir: debug filesystem directory dentry * @pub: public (driver visible) link STA data * TODO Move other link params from sta_info as required for MLD operation */ struct link_sta_info { u8 addr[ETH_ALEN]; u8 link_id; u8 op_mode_nss, capa_nss; struct rhlist_head link_hash_node; struct sta_info *sta; struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS]; struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats; /* Updated from RX path only, no locking requirements */ struct ieee80211_sta_rx_stats rx_stats; struct { struct ewma_signal signal; struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS]; } rx_stats_avg; /* Updated from TX status path only, no locking requirements */ struct { unsigned long filtered; unsigned long retry_failed, retry_count; unsigned int lost_packets; unsigned long last_pkt_time; u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; unsigned long last_ack; s8 last_ack_signal; bool ack_signal_filled; struct ewma_avg_signal avg_ack_signal; } status_stats; /* Updated from TX path only, no locking requirements */ struct { u64 packets[IEEE80211_NUM_ACS]; u64 bytes[IEEE80211_NUM_ACS]; struct ieee80211_tx_rate last_rate; struct rate_info last_rate_info; u64 msdu[IEEE80211_NUM_TIDS + 1]; } tx_stats; enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; enum ieee80211_sta_rx_bandwidth rx_omi_bw_rx, rx_omi_bw_tx, rx_omi_bw_staging; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif struct ieee80211_link_sta *pub; }; /** * struct ieee80211_sta_removed_link_stats - Removed link sta data * * keep required accumulated removed link data for stats * * @rx_packets: accumulated packets (MSDUs & MMPDUs) received from * this station for removed links * @tx_packets: accumulated packets (MSDUs & MMPDUs) transmitted to * this station for removed links * @rx_bytes: accumulated bytes (size of MPDUs) received from this * station for removed links * @tx_bytes: accumulated bytes (size of MPDUs) transmitted to this * station for removed links * @tx_retries: cumulative retry counts (MPDUs) for removed links * @tx_failed: accumulated number of failed transmissions (MPDUs) * (retries exceeded, no ACK) for removed links * @rx_dropped_misc: accumulated dropped packets for un-specified reason * from this station for removed links * @beacon_loss_count: Number of times beacon loss event has triggered * from this station for removed links. * @expected_throughput: expected throughput in kbps (including 802.11 * headers) towards this station for removed links * @pertid_stats: accumulated per-TID statistics for removed link of * station * @pertid_stats.rx_msdu : accumulated number of received MSDUs towards * this station for removed links. * @pertid_stats.tx_msdu: accumulated number of (attempted) transmitted * MSDUs towards this station for removed links * @pertid_stats.tx_msdu_retries: accumulated number of retries (not * counting the first) for transmitted MSDUs towards this station * for removed links * @pertid_stats.tx_msdu_failed: accumulated number of failed transmitted * MSDUs towards this station for removed links */ struct ieee80211_sta_removed_link_stats { u32 rx_packets; u32 tx_packets; u64 rx_bytes; u64 tx_bytes; u32 tx_retries; u32 tx_failed; u32 rx_dropped_misc; u32 beacon_loss_count; u32 expected_throughput; struct { u64 rx_msdu; u64 tx_msdu; u64 tx_msdu_retries; u64 tx_msdu_failed; } pertid_stats; }; /** * struct sta_info - STA information * * This structure collects information about a station that * mac80211 is communicating with. * * @list: global linked list entry * @free_list: list entry for keeping track of stations to free * @hash_node: hash node for rhashtable * @addr: station's MAC address - duplicated from public part to * let the hash table work with just a single cacheline * @local: pointer to the global information * @sdata: virtual interface this station belongs to * @ptk: peer keys negotiated with this station, if any * @ptk_idx: last installed peer key index * @rate_ctrl: rate control algorithm reference * @rate_ctrl_lock: spinlock used to protect rate control data * (data inside the algorithm, so serializes calls there) * @rate_ctrl_priv: rate control private per-STA pointer * @lock: used for locking all fields that require locking, see comments * in the header file. * @drv_deliver_wk: used for delivering frames after driver PS unblocking * @listen_interval: listen interval of this station, when we're acting as AP * @_flags: STA flags, see &enum ieee80211_sta_info_flags, do not use directly * @ps_lock: used for powersave (when mac80211 is the AP) related locking * @ps_tx_buf: buffers (per AC) of frames to transmit to this station * when it leaves power saving state or polls * @tx_filtered: buffers (per AC) of frames we already tried to * transmit but were filtered by hardware due to STA having * entered power saving state, these are also delivered to * the station when it leaves powersave or polls for frames * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on * @assoc_at: clock boottime (in ns) of last association * @last_connected: time (in seconds) when a station got connected * @last_seq_ctrl: last received seq/frag number from this STA (per TID * plus one for non-QoS frames) * @tid_seq: per-TID sequence numbers for sending to this STA * @airtime: per-AC struct airtime_info describing airtime statistics for this * station * @airtime_weight: station weight for airtime fairness calculation purposes * @ampdu_mlme: A-MPDU state machine state * @mesh: mesh STA information * @debugfs_dir: debug filesystem directory dentry * @dead: set to true when sta is unlinked * @removed: set to true when sta is being removed from sta_list * @uploaded: set to true when sta is uploaded to the driver * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) * @rcu_head: RCU head used for freeing this station struct * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer: * * * -1: not yet known * * 0: non-mesh A-MSDU length field * * 1: big-endian mesh A-MSDU length field * * 2: little-endian mesh A-MSDU length field * * @fast_tx: TX fastpath information * @fast_rx: RX fastpath information * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to * the BSS one. * @frags: fragment cache * @cur: storage for aggregation data * &struct ieee80211_sta points either here or to deflink.agg. * @deflink: This is the default link STA information, for non MLO STA all link * specific STA information is accessed through @deflink or through * link[0] which points to address of @deflink. For MLO Link STA * the first added link STA will point to deflink. * @link: reference to Link Sta entries. For Non MLO STA, except 1st link, * i.e link[0] all links would be assigned to NULL by default and * would access link information via @deflink or link[0]. For MLO * STA, first link STA being added will point its link pointer to * @deflink address and remaining would be allocated and the address * would be assigned to link[link_id] where link_id is the id assigned * by the AP. * @rem_link_stats: accumulated removed link stats */ struct sta_info { /* General information, mostly static */ struct list_head list, free_list; struct rcu_head rcu_head; struct rhlist_head hash_node; u8 addr[ETH_ALEN]; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS]; u8 ptk_idx; struct rate_control_ref *rate_ctrl; void *rate_ctrl_priv; spinlock_t rate_ctrl_lock; spinlock_t lock; struct ieee80211_fast_tx __rcu *fast_tx; struct ieee80211_fast_rx __rcu *fast_rx; #ifdef CONFIG_MAC80211_MESH struct mesh_sta *mesh; #endif struct work_struct drv_deliver_wk; u16 listen_interval; bool dead; bool removed; bool uploaded; enum ieee80211_sta_state sta_state; /* use the accessors defined below */ unsigned long _flags; /* STA powersave lock and frame queues */ spinlock_t ps_lock; struct sk_buff_head ps_tx_buf[IEEE80211_NUM_ACS]; struct sk_buff_head tx_filtered[IEEE80211_NUM_ACS]; unsigned long driver_buffered_tids; unsigned long txq_buffered_tids; u64 assoc_at; long last_connected; /* Plus 1 for non-QoS frames */ __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1]; u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; struct airtime_info airtime[IEEE80211_NUM_ACS]; u16 airtime_weight; /* * Aggregation information, locked with lock. */ struct sta_ampdu_mlme ampdu_mlme; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif u8 reserved_tid; s8 amsdu_mesh_control; struct cfg80211_chan_def tdls_chandef; struct ieee80211_fragment_cache frags; struct ieee80211_sta_aggregates cur; struct link_sta_info deflink; struct link_sta_info __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; struct ieee80211_sta_removed_link_stats rem_link_stats; /* keep last! */ struct ieee80211_sta sta; }; static inline int ieee80211_tdls_sta_link_id(struct sta_info *sta) { /* TDLS STA can only have a single link */ return sta->sta.valid_links ? __ffs(sta->sta.valid_links) : 0; } static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta) { #ifdef CONFIG_MAC80211_MESH return sta->mesh->plink_state; #endif return NL80211_PLINK_LISTEN; } static inline void set_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); set_bit(flag, &sta->_flags); } static inline void clear_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); clear_bit(flag, &sta->_flags); } static inline int test_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { return test_bit(flag, &sta->_flags); } static inline int test_and_clear_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); return test_and_clear_bit(flag, &sta->_flags); } static inline int test_and_set_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); return test_and_set_bit(flag, &sta->_flags); } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state); static inline void sta_info_pre_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, new_state); WARN_ON_ONCE(ret); } void ieee80211_assign_tid_tx(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); #define rcu_dereference_protected_tid_tx(sta, tid) \ rcu_dereference_protected((sta)->ampdu_mlme.tid_tx[tid], \ lockdep_is_held(&(sta)->lock) || \ lockdep_is_held(&(sta)->local->hw.wiphy->mtx)); /* Maximum number of frames to buffer per power saving station per AC */ #define STA_MAX_TX_BUFFER 64 /* Minimum buffered frame expiry time. If STA uses listen interval that is * smaller than this value, the minimum value here is used instead. */ #define STA_TX_BUFFER_EXPIRE (10 * HZ) /* How often station data is cleaned up (e.g., expiration of buffered frames) */ #define STA_INFO_CLEANUP_INTERVAL (10 * HZ) struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr); /* * Get a STA info, must be under RCU read lock. */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr); struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); /* user must hold wiphy mutex or be in RCU critical section */ struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr); #define for_each_sta_info(local, _addr, _sta, _tmp) \ rhl_for_each_entry_rcu(_sta, _tmp, \ sta_info_hash_lookup(local, _addr), hash_node) struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr); #define for_each_link_sta_info(local, _addr, _sta, _tmp) \ rhl_for_each_entry_rcu(_sta, _tmp, \ link_sta_info_hash_lookup(local, _addr), \ link_hash_node) struct link_sta_info * link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); /* * Get STA info by index, BROKEN! */ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx); /* * Create a new STA info, caller owns returned structure * until sta_info_insert(). */ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp); struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata, const u8 *mld_addr, unsigned int link_id, const u8 *link_addr, gfp_t gfp); void sta_info_free(struct ieee80211_local *local, struct sta_info *sta); /* * Insert STA info into hash table/list, returns zero or a * -EEXIST if (if the same MAC address is already present). * * Calling the non-rcu version makes the caller relinquish, * the _rcu version calls read_lock_rcu() and must be called * without it held. */ int sta_info_insert(struct sta_info *sta); int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU); int __must_check __sta_info_destroy(struct sta_info *sta); int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr); int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); void sta_info_recalc_tim(struct sta_info *sta); int sta_info_init(struct ieee80211_local *local); void sta_info_stop(struct ieee80211_local *local); /** * __sta_info_flush - flush matching STA entries from the STA table * * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from * @vlans: if the given interface is an AP interface, also flush VLANs * @link_id: if given (>=0), all those STA entries using @link_id only * will be removed. If -1 is passed, all STA entries will be * removed. * @do_not_flush_sta: a station that shouldn't be flushed. */ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id, struct sta_info *do_not_flush_sta); /** * sta_info_flush - flush matching STA entries from the STA table * * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from * @link_id: if given (>=0), all those STA entries using @link_id only * will be removed. If -1 is passed, all STA entries will be * removed. */ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata, int link_id) { return __sta_info_flush(sdata, false, link_id, NULL); } void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo); void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats); void sta_set_accumulated_removed_links_sinfo(struct sta_info *sta, struct station_info *sinfo); u32 sta_get_expected_throughput(struct sta_info *sta); void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time); int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id); void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id); int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id); void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id); void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta); void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta); void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta); unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id); void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len); void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links); enum sta_stats_type { STA_STATS_RATE_TYPE_INVALID = 0, STA_STATS_RATE_TYPE_LEGACY, STA_STATS_RATE_TYPE_HT, STA_STATS_RATE_TYPE_VHT, STA_STATS_RATE_TYPE_HE, STA_STATS_RATE_TYPE_S1G, STA_STATS_RATE_TYPE_EHT, }; #define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) #define STA_STATS_FIELD_LEGACY_IDX GENMASK( 3, 0) #define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) #define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_EHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_EHT_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_BW GENMASK(12, 8) #define STA_STATS_FIELD_SGI GENMASK(13, 13) #define STA_STATS_FIELD_TYPE GENMASK(16, 14) #define STA_STATS_FIELD_HE_RU GENMASK(19, 17) #define STA_STATS_FIELD_HE_GI GENMASK(21, 20) #define STA_STATS_FIELD_HE_DCM GENMASK(22, 22) #define STA_STATS_FIELD_EHT_RU GENMASK(20, 17) #define STA_STATS_FIELD_EHT_GI GENMASK(22, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_RATE_INVALID 0 static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) { u32 r; r = STA_STATS_FIELD(BW, s->bw); if (s->enc_flags & RX_ENC_FLAG_SHORT_GI) r |= STA_STATS_FIELD(SGI, 1); switch (s->encoding) { case RX_ENC_VHT: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_VHT); r |= STA_STATS_FIELD(VHT_NSS, s->nss); r |= STA_STATS_FIELD(VHT_MCS, s->rate_idx); break; case RX_ENC_HT: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HT); r |= STA_STATS_FIELD(HT_MCS, s->rate_idx); break; case RX_ENC_LEGACY: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_LEGACY); r |= STA_STATS_FIELD(LEGACY_BAND, s->band); r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); break; case RX_ENC_HE: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE); r |= STA_STATS_FIELD(HE_NSS, s->nss); r |= STA_STATS_FIELD(HE_MCS, s->rate_idx); r |= STA_STATS_FIELD(HE_GI, s->he_gi); r |= STA_STATS_FIELD(HE_RU, s->he_ru); r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); break; case RX_ENC_EHT: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_EHT); r |= STA_STATS_FIELD(EHT_NSS, s->nss); r |= STA_STATS_FIELD(EHT_MCS, s->rate_idx); r |= STA_STATS_FIELD(EHT_GI, s->eht.gi); r |= STA_STATS_FIELD(EHT_RU, s->eht.ru); break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; } return r; } #endif /* STA_INFO_H */
194 76 2 1 40 24 40 23 3 1 2 3 1 1 53 53 53 48 48 1 1 48 3 3 1 1 1 2 2 3 3 3 2 2 2 2 2 2 21 21 20 9 11 1 13 66 66 10 70 70 3 39 42 35 48 48 48 39 9 9 48 25 3 3 9 30 2 36 63 63 25 42 63 53 16 16 53 53 16 16 16 53 5 5 5 51 2 272 272 272 272 269 6 3 268 3 2 1 268 3 3 268 268 271 253 21 271 2179 2174 2178 573 1986 2168 1983 7 1 5 6 5 2 2 2 1 2 395 2 253 253 53 53 181 182 52 396 399 399 1 400 398 402 1 399 396 401 397 398 400 1 396 397 399 396 397 396 393 392 391 1 392 27 344 259 1 214 21 16 9 21 19 16 4 21 1 2 22 2 21 2 1 1 11 11 11 11 11 11 5 11 11 2 1 53 53 255 127 130 24 66 50 28 28 27 16 23 28 1 240 237 58 2 56 56 2 407 1400 1404 334 5 1062 25 77 40 4 3 2 2 2 31 46 1097 1096 1103 1 1062 78 78 78 78 9 72 4 40 3 25 25 25 25 55 55 1764 1219 1 540 1414 350 334 1424 1746 412 1109 1076 241 409 5 645 646 26 297 1 7 5 305 1382 1688 1684 1691 25 1685 1685 1658 38 4 1 1659 4 1658 332 1 229 1 2 1406 1632 1627 3 6 6 362 328 73 1768 250 1765 1761 1 1 1 1 13 30 7 2 20 13 34 12 26 26 19 23 3 5 3 31 31 49 44 44 3 2 44 43 3 50 1 49 46 1 43 51 21 50 19 32 48 57 55 2 57 57 1 53 53 255 1 1 700 717 1 717 718 1 1 551 179 1626 2 1619 7 3 277 218 61 3 47 36 2 1 2 18 127 131 253 4 28 225 253 253 4 4 6 2 2 2 2 2 4 2 10 10 3 25 18 24 7 3 3 2 2 3 12 3 9 4 8 2 2 2 3 36 23 2 2 1 8 419 295 4 298 153 1 19 1 1 37 268 1 1 1 1 10 149 2 2 2 2 4 2 2 2 3 2 7 1 2 1 3 2 2 36 2 6 2 7 2 2 2 1 1 3 1 1 25 1 3 2 2 7 2 1 1 3 14 7 149 419 16 1 15 101 101 63 63 90 34 4 2 2 27 6 11 2 4 32 30 32 32 2739 2207 681 32 182 180 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 // SPDX-License-Identifier: GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <uapi/linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <net/rps.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> #include "tun_vnet.h" static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ u32 mask[2]; /* Mask of the hashed addrs */ unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; /* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal * to max number of VCPUs in guest. */ #define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and * tap_filter were kept in tun_struct since they were used for filtering for the * netdevice not for a specific queue (at least I didn't see the requirement for * this). * * RCU usage: * The tun_file and tun_struct are loosely coupled, the pointer from one to the * other can only be read while rcu_read_lock or rtnl_lock is held. */ struct tun_file { struct sock sk; struct socket socket; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; struct tun_page { struct page *page; int count; }; struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; struct tun_struct *tun; u32 rxhash; u32 rps_rxhash; int queue_index; unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1) struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. */ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4 | \ NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static void tun_napi_enable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = timer_container_of(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); txq = e->queue_index; } else { txq = reciprocal_scale(txq, numqueues); } return txq; } static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_prog *prog; u32 numqueues; u16 ret = 0; numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0; prog = rcu_dereference(tun->steering_prog); if (prog) ret = bpf_prog_run_clear_cb(prog->prog, skb); return ret % numqueues; } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; rcu_read_lock(); if (rcu_dereference(tun->steering_prog)) ret = tun_ebpf_select_queue(tun, skb); else ret = tun_automq_select_queue(tun, skb); rcu_read_unlock(); return ret; } static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || (gid_valid(tun->group) && !in_egroup_p(tun->group))) && !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) { netif_set_real_num_tx_queues(tun->dev, tun->numqueues); netif_set_real_num_rx_queues(tun->dev, tun->numqueues); } static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile) { tfile->detached = tun; list_add_tail(&tfile->next, &tun->disabled); ++tun->numdisabled; } static struct tun_struct *tun_enable_queue(struct tun_file *tfile) { struct tun_struct *tun = tfile->detached; tfile->detached = NULL; list_del_init(&tfile->next); --tun->numdisabled; return tun; } void tun_ptr_free(void *ptr) { if (!ptr) return; if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } } EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { void *ptr; while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); } static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; struct tun_struct *tun; tun = rtnl_dereference(tfile->tun); if (tun && clean) { if (!tfile->detached) tun_napi_disable(tfile); tun_napi_del(tfile); } if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; ntfile->xdp_rxq.queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); --tun->numqueues; if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); } else { tun_disable_queue(tun, tfile); tun_napi_disable(tfile); } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); sock_put(&tfile->sk); } if (clean) { if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); } } static void tun_detach(struct tun_file *tfile, bool clean) { struct tun_struct *tun; struct net_device *dev; rtnl_lock(); tun = rtnl_dereference(tfile->tun); dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); if (dev) netdev_state_change(dev); rtnl_unlock(); if (clean) sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter, bool napi, bool napi_frags, bool publish_tun) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); if (err < 0) goto out; err = -EINVAL; if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; if (!tfile->detached && tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES) goto out; err = 0; /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { lock_sock(tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. */ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. */ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) return err; tun_flow_init(tun); dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->hw_enc_features = dev->hw_features; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); return err; } return 0; } /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } /* Net device start xmit */ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); if (e) tun_flow_save_rps_rxhash(e, rxhash); } #endif } static unsigned int run_ebpf_filter(struct tun_struct *tun, struct sk_buff *skb, int len) { struct tun_prog *prog = rcu_dereference(tun->filter_prog); if (prog) len = bpf_prog_run_clear_cb(prog->prog, skb); return len; } /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ if (!tfile) { drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. */ if (!check_filter(&tun->txflt, skb)) { drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; } if (tfile->socket.sk->sk_filter && sk_filter_reason(tfile->socket.sk, skb, &drop_reason)) goto drop; len = run_ebpf_filter(tun, skb, len); if (len == 0) { drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; } if (pskb_trim(skb, len)) { drop_reason = SKB_DROP_REASON_NOMEM; goto drop; } if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. */ skb_orphan(skb); nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } /* dev->lltx requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) { /* * This callback is supposed to deal with mc filter in * _rx_ path and has nothing to do with the _tx_ path. * In rx path we always accept everything userspace gives us. */ } static netdev_features_t tun_net_fix_features(struct net_device *dev, netdev_features_t features) { struct tun_struct *tun = netdev_priv(dev); return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } static void tun_set_headroom(struct net_device *dev, int new_hr) { struct tun_struct *tun = netdev_priv(dev); if (new_hr < NET_SKB_PAD) new_hr = NET_SKB_PAD; tun->align = new_hr; } static void tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct tun_struct *tun = netdev_priv(dev); dev_get_tstats64(dev, stats); stats->rx_frame_errors += (unsigned long)atomic_long_read(&tun->rx_frame_errors); } static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; struct bpf_prog *old_prog; int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } list_for_each_entry(tfile, &tun->disabled, next) { if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } return 0; } static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) { struct tun_struct *tun = netdev_priv(dev); if (!tun->numqueues) return -EPERM; netif_carrier_on(dev); } else { netif_carrier_off(dev); } return 0; } static const struct net_device_ops tun_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) { /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); } static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; int nxmit = 0; int i; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); resample: numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); if (unlikely(!tfile)) goto resample; spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; } spin_unlock(&tfile->tx_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __tun_xdp_flush_tfile(tfile); rcu_read_unlock(); return nxmit; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); int nxmit; if (unlikely(!frame)) return -EOVERFLOW; nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH); if (!nxmit) xdp_return_frame_rx_napi(frame); return nxmit; } static const struct net_device_ops tap_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) { int i; for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); tun->ageing_time = TUN_FLOW_EXPIRE; timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0); mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) { timer_delete_sync(&tun->flow_gc_timer); tun_flow_flush(tun); } #define MIN_MTU 68 #define MAX_MTU 65535 /* Initialize net device. */ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. */ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. */ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad, int metasize) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); if (metasize) skb_metadata_set(skb, metasize); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int metasize = 0; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; /* It is known that the xdp_buff was prepared with metadata * support, so the metasize will never be negative. */ metasize = xdp.data - xdp.data_meta; } bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); out: bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; int good_linear; int copylen; int hdr_len = 0; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; netdev_features_t features = 0; /* * Keep it easy and always zero the whole buffer, even if the * tunnel-related field will be touched only when the feature * is enabled and the hdr size id compatible. */ memset(&hdr, 0, sizeof(hdr)); gso = (struct virtio_net_hdr *)&hdr; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); features = tun_vnet_hdr_guest_features(vnet_hdr_sz); hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, features, from, gso); if (hdr_len < 0) return hdr_len; len -= vnet_hdr_sz; } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ copylen = min(hdr_len ? hdr_len : GOODCOPY_LEN, good_linear); linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; linear = min(hdr_len, good_linear); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). */ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, &hdr)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } if (frags && skb != tfile->napi.skb) tfile->napi.skb = skb; } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. */ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. */ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t result; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; result = tun_get_user(tun, tfile, NULL, from, noblock, false); tun_put(tun); return result; } static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; ssize_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); if (ret) return ret; } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, ret); preempt_enable(); return ret; } /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; int ret; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); total = skb->len + vlan_hlen + vnet_hdr_sz; if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; } if (vnet_hdr_sz) { struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb, &hdr); if (ret) return ret; /* * Drop the packet if the configured header size is too small * WRT the enabled offloads. */ gso = (struct virtio_net_hdr *)&hdr; ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features, iter, gso); if (ret) return ret; } if (vlan_hlen) { int ret; struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: /* caller is in process context, */ preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen); preempt_enable(); return total; } static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); void *ptr = NULL; int error = 0; ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) goto out; if (noblock) { error = -EAGAIN; goto out; } add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; break; } if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { error = -EFAULT; break; } schedule(); } __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; if (!iov_iter_count(to)) { tun_ptr_free(ptr); return 0; } if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tun_do_read(tun, tfile, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; tun_put(tun); return ret; } static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->prog = prog; } spin_lock_bh(&tun->lock); old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_prog_free); return 0; } static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); tun->owner = INVALID_UID; tun->group = INVALID_GID; tun_default_link_ksettings(dev, &tun->link_ksettings); dev->ethtool_ops = &tun_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = tun_free_netdev; /* We prefer our own queue length */ dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap * device with netlink. */ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported."); return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) { BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); return nla_total_size(sizeof(uid_t)) + /* OWNER */ nla_total_size(sizeof(gid_t)) + /* GROUP */ nla_total_size(sizeof(u8)) + /* TYPE */ nla_total_size(sizeof(u8)) + /* PI */ nla_total_size(sizeof(u8)) + /* VNET_HDR */ nla_total_size(sizeof(u8)) + /* PERSIST */ nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ 0; } static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) goto nla_put_failure; if (uid_valid(tun->owner) && nla_put_u32(skb, IFLA_TUN_OWNER, from_kuid_munged(current_user_ns(), tun->owner))) goto nla_put_failure; if (gid_valid(tun->group) && nla_put_u32(skb, IFLA_TUN_GROUP, from_kgid_munged(current_user_ns(), tun->group))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, !!(tun->flags & IFF_MULTI_QUEUE))) goto nla_put_failure; if (tun->flags & IFF_MULTI_QUEUE) { if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, tun->numdisabled)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, .get_size = tun_get_size, .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) { struct tun_file *tfile; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_sync_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); tfile = container_of(sk, struct tun_file, sk); kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static void tun_put_page(struct tun_page *tpage) { if (tpage->page) __page_frag_cache_drain(tpage->page, tpage->count); } static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, struct tun_page *tpage) { unsigned int datasize = xdp->data_end - xdp->data; struct virtio_net_hdr *gso = xdp->data_hard_start; struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; netdev_features_t features; u32 rxhash = 0, act; int buflen = xdp->frame_sz; int metasize = 0; int ret = 0; bool skb_xdp = false; struct page *page; if (unlikely(datasize < ETH_HLEN)) return -EINVAL; xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { if (gso->gso_type) { skb_xdp = true; goto build; } xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); act = bpf_prog_run_xdp(xdp_prog, xdp); ret = tun_xdp_act(tun, xdp_prog, xdp, act); if (ret < 0) { put_page(virt_to_head_page(xdp->data)); return ret; } switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; case XDP_TX: return 0; case XDP_PASS: break; default: page = virt_to_head_page(xdp->data); if (tpage->page == page) { ++tpage->count; } else { tun_put_page(tpage); tpage->page = page; tpage->count = 1; } return 0; } } build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { ret = -ENOMEM; goto out; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); /* The externally provided xdp_buff may have no metadata support, which * is marked by xdp->data_meta being xdp->data + 1. This will lead to a * metasize of -1 and is the reason why the condition checks for > 0. */ metasize = xdp->data - xdp->data_meta; if (metasize > 0) skb_metadata_set(skb, metasize); features = tun_vnet_hdr_guest_features(READ_ONCE(tun->vnet_hdr_sz)); tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso; if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; goto out; } skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); if (tfile->napi_enabled) { queue = &tfile->sk.sk_write_queue; spin_lock(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock(&queue->lock); kfree_skb(skb); return -EBUSY; } __skb_queue_tail(queue, skb); spin_unlock(&queue->lock); ret = 1; } else { netif_receive_skb(skb); ret = 0; } /* No need to disable preemption here since this function is * always called with bh disabled */ dev_sw_netstats_rx_add(tun->dev, datasize); if (rxhash) tun_flow_update(tun, rxhash, tfile); out: return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; if (!tun) return -EBADFD; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); if (ret > 0) queued += ret; } if (flush) xdp_do_flush(); if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); tun_put_page(&tpage); ret = total_len; goto out; } ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); out: tun_put(tun); return ret; } static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out_put_tun; } if (flags & MSG_ERRQUEUE) { ret = sock_recv_errqueue(sock->sk, m, total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } out: tun_put(tun); return ret; out_put_tun: tun_put(tun); out_free: tun_ptr_free(ptr); return ret; } static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { return 0; } } static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun; int ret = 0; tun = tun_get(tfile); if (!tun) return 0; ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? sysfs_emit(buf, "%u\n", from_kuid_munged(current_user_ns(), tun->owner)) : sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? sysfs_emit(buf, "%u\n", from_kgid_munged(current_user_ns(), tun->group)) : sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); static DEVICE_ATTR_RO(owner); static DEVICE_ATTR_RO(group); static struct attribute *tun_dev_attrs[] = { &dev_attr_tun_flags.attr, &dev_attr_owner.attr, &dev_attr_group.attr, NULL }; static const struct attribute_group tun_attr_group = { .attrs = tun_dev_attrs }; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; struct tun_file *tfile = file->private_data; struct net_device *dev; int err; if (tfile->detached) return -EINVAL; if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!(ifr->ifr_flags & IFF_NAPI) || (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) return -EINVAL; } dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) tun = netdev_priv(dev); else return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, true); if (err < 0) return err; if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. */ netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); netdev_state_change(dev); } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = security_tun_dev_create(); if (err < 0) return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; if (*ifr->ifr_name) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); tun->ifr = ifr; tun->file = file; tun_net_initialize(dev); err = register_netdevice(tun->dev); if (err < 0) { free_netdev(dev); return err; } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } if (ifr->ifr_flags & IFF_NO_CARRIER) netif_carrier_off(tun->dev); else netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. */ if (netif_running(tun->dev)) netif_tx_wake_all_queues(tun->dev); strscpy(ifr->ifr_name, tun->dev->name); return 0; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { strscpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); } #define PLAIN_GSO (NETIF_F_GSO_UDP_L4 | NETIF_F_TSO | NETIF_F_TSO6) /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) { netdev_features_t features = 0; if (arg & TUN_F_CSUM) { features |= NETIF_F_HW_CSUM; arg &= ~TUN_F_CSUM; if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) { features |= NETIF_F_TSO_ECN; arg &= ~TUN_F_TSO_ECN; } if (arg & TUN_F_TSO4) features |= NETIF_F_TSO; if (arg & TUN_F_TSO6) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } arg &= ~TUN_F_UFO; /* TODO: for now USO4 and USO6 should work simultaneously */ if (arg & TUN_F_USO4 && arg & TUN_F_USO6) { features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } /* * Tunnel offload is allowed only if some plain offload is * available, too. */ if (features & PLAIN_GSO && arg & TUN_F_UDP_TUNNEL_GSO) { features |= NETIF_F_GSO_UDP_TUNNEL; if (arg & TUN_F_UDP_TUNNEL_GSO_CSUM) features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; arg &= ~(TUN_F_UDP_TUNNEL_GSO | TUN_F_UDP_TUNNEL_GSO_CSUM); } } /* This gives the user a way to test for new features in future by * trying to set them. */ if (arg) return -EINVAL; tun->set_features = features; tun->dev->wanted_features &= ~TUN_USER_FEATURES; tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; } static void tun_detach_filter(struct tun_struct *tun, int n) { int i; struct tun_file *tfile; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); sk_detach_filter(tfile->socket.sk); release_sock(tfile->socket.sk); } tun->filter_attached = false; } static int tun_attach_filter(struct tun_struct *tun) { int i, ret = 0; struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; } } tun->filter_attached = true; return ret; } static void tun_set_sndbuf(struct tun_struct *tun) { struct tun_file *tfile; int i; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; int ret = 0; rtnl_lock(); if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; if (!tun) { ret = -EINVAL; goto unlock; } ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, tun->flags & IFF_NAPI_FRAGS, true); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; if (ret >= 0) netdev_state_change(tun->dev); unlock: rtnl_unlock(); return ret; } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; int fd; if (copy_from_user(&fd, data, sizeof(fd))) return -EFAULT; if (fd == -1) { prog = NULL; } else { prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) return PTR_ERR(prog); } return __tun_set_ebpf(tun, prog_p, prog); } /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ static unsigned char tun_get_addr_len(unsigned short type) { switch (type) { case ARPHRD_IP6GRE: case ARPHRD_TUNNEL6: return sizeof(struct in6_addr); case ARPHRD_IPGRE: case ARPHRD_TUNNEL: case ARPHRD_SIT: return 4; case ARPHRD_ETHER: return ETH_ALEN; case ARPHRD_IEEE802154: case ARPHRD_IEEE802154_MONITOR: return IEEE802154_EXTENDED_ADDR_LEN; case ARPHRD_PHONET_PIPE: case ARPHRD_PPP: case ARPHRD_NONE: return 0; case ARPHRD_6LOWPAN: return EUI64_ADDR_LEN; case ARPHRD_FDDI: return FDDI_K_ALEN; case ARPHRD_HIPPI: return HIPPI_ALEN; case ARPHRD_IEEE802: return FC_ALEN; case ARPHRD_ROSE: return ROSE_ADDR_LEN; case ARPHRD_NETROM: return AX25_ADDR_LEN; case ARPHRD_LOCALTLK: return LTALK_ALEN; default: return 0; } } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int ifindex; int sndbuf; int ret; bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { memset(&ifr, 0, sizeof(ifr)); } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); } rtnl_lock(); tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) goto unlock; ifr.ifr_name[IFNAMSIZ-1] = '\0'; ret = tun_set_iff(net, file, &ifr); if (ret) goto unlock; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; } if (cmd == TUNSETIFINDEX) { ret = -EPERM; if (tun) goto unlock; ret = -EFAULT; if (copy_from_user(&ifindex, argp, sizeof(ifindex))) goto unlock; ret = -EINVAL; if (ifindex < 0) goto unlock; ret = 0; tfile->ifindex = ifindex; goto unlock; } ret = -EBADFD; if (!tun) goto unlock; netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; if (!tfile->socket.sk->sk_filter) ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case TUNSETNOCSUM: /* Disable/Enable checksum */ /* [unimplemented] */ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); do_notify = true; } netif_info(tun, drv, tun->dev, "persist %s\n", arg ? "enabled" : "disabled"); break; case TUNSETOWNER: /* Set owner of the device */ owner = make_kuid(current_user_ns(), arg); if (!uid_valid(owner)) { ret = -EINVAL; break; } tun->owner = owner; do_notify = true; netif_info(tun, drv, tun->dev, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: /* Set group of the device */ group = make_kgid(current_user_ns(), arg); if (!gid_valid(group)) { ret = -EINVAL; break; } tun->group = group; do_notify = true; netif_info(tun, drv, tun->dev, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { netif_info(tun, drv, tun->dev, "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, tun->dev); ret = notifier_to_errno(ret); if (ret) { netif_info(tun, drv, tun->dev, "Refused to change device type\n"); break; } tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); netif_info(tun, drv, tun->dev, "linktype set to %d\n", tun->dev->type); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, tun->dev); } break; case TUNSETDEBUG: tun->msg_enable = (u32)arg; break; case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: /* Get hw address */ netif_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ if (tun->dev->addr_len > sizeof(ifr.ifr_hwaddr)) { ret = -EINVAL; break; } ret = dev_set_mac_address_user(tun->dev, (struct sockaddr_storage *)&ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; case TUNSETSNDBUF: if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { ret = -EFAULT; break; } if (sndbuf <= 0) { ret = -EINVAL; break; } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); break; case TUNGETFILTER: ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) break; ret = 0; break; case TUNSETSTEERINGEBPF: ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; case TUNSETFILTEREBPF: ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; case TUNSETCARRIER: ret = -EFAULT; if (copy_from_user(&carrier, argp, sizeof(carrier))) goto unlock; ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto unlock; ret = open_related_ns(&net->ns, get_net_ns); break; default: ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); break; } if (do_notify) netdev_state_change(tun->dev); unlock: rtnl_unlock(); if (tun) tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT static long tun_chr_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case TUNSETIFF: case TUNGETIFF: case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; default: arg = (compat_ulong_t)arg; break; } /* * compat_ifreq is shorter than ifreq, so we must not access beyond * the end of that structure. All fields that are used in this * driver are compatible though, we don't need to convert the * contents. */ return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */ static int tun_chr_fasync(int fd, struct file *file, int on) { struct tun_file *tfile = file->private_data; int ret; if (on) { ret = file_f_owner_allocate(file); if (ret) goto out; } if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; ret = 0; out: return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) { sk_free(&tfile->sk); return -ENOMEM; } mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); /* tun groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; tun_detach(tfile, true); return 0; } #ifdef CONFIG_PROC_FS static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); tun = tun_get(tfile); if (tun) tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) tun_put(tun); seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = tun_chr_compat_ioctl, #endif .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = tun_chr_show_fdinfo, #endif }; static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; /* ethtool interface */ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_zero_link_mode(cmd, advertising); cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; } static int tun_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(cmd, &tun->link_ksettings, sizeof(*cmd)); return 0; } static int tun_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(&tun->link_ksettings, cmd, sizeof(*cmd)); return 0; } static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct tun_struct *tun = netdev_priv(dev); strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } static u32 tun_get_msglevel(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { struct tun_struct *tun = netdev_priv(dev); tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); ec->rx_max_coalesced_frames = tun->rx_batched; return 0; } static int tun_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) tun->rx_batched = NAPI_POLL_WEIGHT; else tun->rx_batched = ec->rx_max_coalesced_frames; return 0; } static void tun_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct tun_struct *tun = netdev_priv(dev); channels->combined_count = tun->numqueues; channels->max_combined = tun->flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_channels = tun_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun"); MODULE_IMPORT_NS("NETDEV_INTERNAL");
18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 // SPDX-License-Identifier: GPL-2.0-or-later /* * History * 03-01-2007 Added forwarding for x.25 Andrew Hendry */ #define pr_fmt(fmt) "X25: " fmt #include <linux/if_arp.h> #include <linux/init.h> #include <linux/slab.h> #include <net/x25.h> LIST_HEAD(x25_forward_list); DEFINE_RWLOCK(x25_forward_list_lock); int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, struct sk_buff *skb, int lci) { struct x25_route *rt; struct x25_neigh *neigh_new = NULL; struct x25_forward *x25_frwd, *new_frwd; struct sk_buff *skbn; short same_lci = 0; int rc = 0; if ((rt = x25_get_route(dest_addr)) == NULL) goto out_no_route; if ((neigh_new = x25_get_neigh(rt->dev)) == NULL) { /* This shouldn't happen, if it occurs somehow * do something sensible */ goto out_put_route; } /* Avoid a loop. This is the normal exit path for a * system with only one x.25 iface and default route */ if (rt->dev == from->dev) { goto out_put_nb; } /* Remote end sending a call request on an already * established LCI? It shouldn't happen, just in case.. */ read_lock_bh(&x25_forward_list_lock); list_for_each_entry(x25_frwd, &x25_forward_list, node) { if (x25_frwd->lci == lci) { pr_warn("call request for lci which is already registered!, transmitting but not registering new pair\n"); same_lci = 1; } } read_unlock_bh(&x25_forward_list_lock); /* Save the forwarding details for future traffic */ if (!same_lci){ if ((new_frwd = kmalloc(sizeof(struct x25_forward), GFP_ATOMIC)) == NULL){ rc = -ENOMEM; goto out_put_nb; } new_frwd->lci = lci; new_frwd->dev1 = rt->dev; new_frwd->dev2 = from->dev; write_lock_bh(&x25_forward_list_lock); list_add(&new_frwd->node, &x25_forward_list); write_unlock_bh(&x25_forward_list_lock); } /* Forward the call request */ if ( (skbn = skb_clone(skb, GFP_ATOMIC)) == NULL){ goto out_put_nb; } x25_transmit_link(skbn, neigh_new); rc = 1; out_put_nb: x25_neigh_put(neigh_new); out_put_route: x25_route_put(rt); out_no_route: return rc; } int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) { struct x25_forward *frwd; struct net_device *peer = NULL; struct x25_neigh *nb; struct sk_buff *skbn; int rc = 0; read_lock_bh(&x25_forward_list_lock); list_for_each_entry(frwd, &x25_forward_list, node) { if (frwd->lci == lci) { /* The call is established, either side can send */ if (from->dev == frwd->dev1) { peer = frwd->dev2; } else { peer = frwd->dev1; } break; } } read_unlock_bh(&x25_forward_list_lock); if ( (nb = x25_get_neigh(peer)) == NULL) goto out; if ( (skbn = pskb_copy(skb, GFP_ATOMIC)) == NULL){ goto output; } x25_transmit_link(skbn, nb); rc = 1; output: x25_neigh_put(nb); out: return rc; } void x25_clear_forward_by_lci(unsigned int lci) { struct x25_forward *fwd, *tmp; write_lock_bh(&x25_forward_list_lock); list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) { if (fwd->lci == lci) { list_del(&fwd->node); kfree(fwd); } } write_unlock_bh(&x25_forward_list_lock); } void x25_clear_forward_by_dev(struct net_device *dev) { struct x25_forward *fwd, *tmp; write_lock_bh(&x25_forward_list_lock); list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) { if ((fwd->dev1 == dev) || (fwd->dev2 == dev)){ list_del(&fwd->node); kfree(fwd); } } write_unlock_bh(&x25_forward_list_lock); }
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 // SPDX-License-Identifier: GPL-2.0-only /* * NFC hardware simulation driver * Copyright (c) 2013, Intel Corporation. */ #include <linux/device.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/nfc.h> #include <net/nfc/nfc.h> #include <net/nfc/digital.h> #define NFCSIM_ERR(d, fmt, args...) nfc_err(&d->nfc_digital_dev->nfc_dev->dev, \ "%s: " fmt, __func__, ## args) #define NFCSIM_DBG(d, fmt, args...) dev_dbg(&d->nfc_digital_dev->nfc_dev->dev, \ "%s: " fmt, __func__, ## args) #define NFCSIM_VERSION "0.2" #define NFCSIM_MODE_NONE 0 #define NFCSIM_MODE_INITIATOR 1 #define NFCSIM_MODE_TARGET 2 #define NFCSIM_CAPABILITIES (NFC_DIGITAL_DRV_CAPS_IN_CRC | \ NFC_DIGITAL_DRV_CAPS_TG_CRC) struct nfcsim { struct nfc_digital_dev *nfc_digital_dev; struct work_struct recv_work; struct delayed_work send_work; struct nfcsim_link *link_in; struct nfcsim_link *link_out; bool up; u8 mode; u8 rf_tech; u16 recv_timeout; nfc_digital_cmd_complete_t cb; void *arg; u8 dropframe; }; struct nfcsim_link { struct mutex lock; u8 rf_tech; u8 mode; u8 shutdown; struct sk_buff *skb; wait_queue_head_t recv_wait; u8 cond; }; static struct nfcsim_link *nfcsim_link_new(void) { struct nfcsim_link *link; link = kzalloc(sizeof(struct nfcsim_link), GFP_KERNEL); if (!link) return NULL; mutex_init(&link->lock); init_waitqueue_head(&link->recv_wait); return link; } static void nfcsim_link_free(struct nfcsim_link *link) { dev_kfree_skb(link->skb); kfree(link); } static void nfcsim_link_recv_wake(struct nfcsim_link *link) { link->cond = 1; wake_up_interruptible(&link->recv_wait); } static void nfcsim_link_set_skb(struct nfcsim_link *link, struct sk_buff *skb, u8 rf_tech, u8 mode) { mutex_lock(&link->lock); dev_kfree_skb(link->skb); link->skb = skb; link->rf_tech = rf_tech; link->mode = mode; mutex_unlock(&link->lock); } static void nfcsim_link_recv_cancel(struct nfcsim_link *link) { mutex_lock(&link->lock); link->mode = NFCSIM_MODE_NONE; mutex_unlock(&link->lock); nfcsim_link_recv_wake(link); } static void nfcsim_link_shutdown(struct nfcsim_link *link) { mutex_lock(&link->lock); link->shutdown = 1; link->mode = NFCSIM_MODE_NONE; mutex_unlock(&link->lock); nfcsim_link_recv_wake(link); } static struct sk_buff *nfcsim_link_recv_skb(struct nfcsim_link *link, int timeout, u8 rf_tech, u8 mode) { int rc; struct sk_buff *skb; rc = wait_event_interruptible_timeout(link->recv_wait, link->cond, msecs_to_jiffies(timeout)); mutex_lock(&link->lock); skb = link->skb; link->skb = NULL; if (!rc) { rc = -ETIMEDOUT; goto done; } if (!skb || link->rf_tech != rf_tech || link->mode == mode) { rc = -EINVAL; goto done; } if (link->shutdown) { rc = -ENODEV; goto done; } done: mutex_unlock(&link->lock); if (rc < 0) { dev_kfree_skb(skb); skb = ERR_PTR(rc); } link->cond = 0; return skb; } static void nfcsim_send_wq(struct work_struct *work) { struct nfcsim *dev = container_of(work, struct nfcsim, send_work.work); /* * To effectively send data, the device just wake up its link_out which * is the link_in of the peer device. The exchanged skb has already been * stored in the dev->link_out through nfcsim_link_set_skb(). */ nfcsim_link_recv_wake(dev->link_out); } static void nfcsim_recv_wq(struct work_struct *work) { struct nfcsim *dev = container_of(work, struct nfcsim, recv_work); struct sk_buff *skb; skb = nfcsim_link_recv_skb(dev->link_in, dev->recv_timeout, dev->rf_tech, dev->mode); if (!dev->up) { NFCSIM_ERR(dev, "Device is down\n"); if (!IS_ERR(skb)) dev_kfree_skb(skb); return; } dev->cb(dev->nfc_digital_dev, dev->arg, skb); } static int nfcsim_send(struct nfc_digital_dev *ddev, struct sk_buff *skb, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { struct nfcsim *dev = nfc_digital_get_drvdata(ddev); u8 delay; if (!dev->up) { NFCSIM_ERR(dev, "Device is down\n"); return -ENODEV; } dev->recv_timeout = timeout; dev->cb = cb; dev->arg = arg; schedule_work(&dev->recv_work); if (dev->dropframe) { NFCSIM_DBG(dev, "dropping frame (out of %d)\n", dev->dropframe); dev_kfree_skb(skb); dev->dropframe--; return 0; } if (skb) { nfcsim_link_set_skb(dev->link_out, skb, dev->rf_tech, dev->mode); /* Add random delay (between 3 and 10 ms) before sending data */ get_random_bytes(&delay, 1); delay = 3 + (delay & 0x07); schedule_delayed_work(&dev->send_work, msecs_to_jiffies(delay)); } return 0; } static void nfcsim_abort_cmd(struct nfc_digital_dev *ddev) { const struct nfcsim *dev = nfc_digital_get_drvdata(ddev); nfcsim_link_recv_cancel(dev->link_in); } static int nfcsim_switch_rf(struct nfc_digital_dev *ddev, bool on) { struct nfcsim *dev = nfc_digital_get_drvdata(ddev); dev->up = on; return 0; } static int nfcsim_in_configure_hw(struct nfc_digital_dev *ddev, int type, int param) { struct nfcsim *dev = nfc_digital_get_drvdata(ddev); switch (type) { case NFC_DIGITAL_CONFIG_RF_TECH: dev->up = true; dev->mode = NFCSIM_MODE_INITIATOR; dev->rf_tech = param; break; case NFC_DIGITAL_CONFIG_FRAMING: break; default: NFCSIM_ERR(dev, "Invalid configuration type: %d\n", type); return -EINVAL; } return 0; } static int nfcsim_in_send_cmd(struct nfc_digital_dev *ddev, struct sk_buff *skb, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { return nfcsim_send(ddev, skb, timeout, cb, arg); } static int nfcsim_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) { struct nfcsim *dev = nfc_digital_get_drvdata(ddev); switch (type) { case NFC_DIGITAL_CONFIG_RF_TECH: dev->up = true; dev->mode = NFCSIM_MODE_TARGET; dev->rf_tech = param; break; case NFC_DIGITAL_CONFIG_FRAMING: break; default: NFCSIM_ERR(dev, "Invalid configuration type: %d\n", type); return -EINVAL; } return 0; } static int nfcsim_tg_send_cmd(struct nfc_digital_dev *ddev, struct sk_buff *skb, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { return nfcsim_send(ddev, skb, timeout, cb, arg); } static int nfcsim_tg_listen(struct nfc_digital_dev *ddev, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { return nfcsim_send(ddev, NULL, timeout, cb, arg); } static const struct nfc_digital_ops nfcsim_digital_ops = { .in_configure_hw = nfcsim_in_configure_hw, .in_send_cmd = nfcsim_in_send_cmd, .tg_listen = nfcsim_tg_listen, .tg_configure_hw = nfcsim_tg_configure_hw, .tg_send_cmd = nfcsim_tg_send_cmd, .abort_cmd = nfcsim_abort_cmd, .switch_rf = nfcsim_switch_rf, }; static struct dentry *nfcsim_debugfs_root; static void nfcsim_debugfs_init(void) { nfcsim_debugfs_root = debugfs_create_dir("nfcsim", NULL); } static void nfcsim_debugfs_remove(void) { debugfs_remove_recursive(nfcsim_debugfs_root); } static void nfcsim_debugfs_init_dev(struct nfcsim *dev) { struct dentry *dev_dir; char devname[5]; /* nfcX\0 */ u32 idx; int n; if (!nfcsim_debugfs_root) { NFCSIM_ERR(dev, "nfcsim debugfs not initialized\n"); return; } idx = dev->nfc_digital_dev->nfc_dev->idx; n = snprintf(devname, sizeof(devname), "nfc%d", idx); if (n >= sizeof(devname)) { NFCSIM_ERR(dev, "Could not compute dev name for dev %d\n", idx); return; } dev_dir = debugfs_create_dir(devname, nfcsim_debugfs_root); debugfs_create_u8("dropframe", 0664, dev_dir, &dev->dropframe); } static struct nfcsim *nfcsim_device_new(struct nfcsim_link *link_in, struct nfcsim_link *link_out) { struct nfcsim *dev; int rc; dev = kzalloc(sizeof(struct nfcsim), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); INIT_DELAYED_WORK(&dev->send_work, nfcsim_send_wq); INIT_WORK(&dev->recv_work, nfcsim_recv_wq); dev->nfc_digital_dev = nfc_digital_allocate_device(&nfcsim_digital_ops, NFC_PROTO_NFC_DEP_MASK, NFCSIM_CAPABILITIES, 0, 0); if (!dev->nfc_digital_dev) { kfree(dev); return ERR_PTR(-ENOMEM); } nfc_digital_set_drvdata(dev->nfc_digital_dev, dev); dev->link_in = link_in; dev->link_out = link_out; rc = nfc_digital_register_device(dev->nfc_digital_dev); if (rc) { pr_err("Could not register digital device (%d)\n", rc); nfc_digital_free_device(dev->nfc_digital_dev); kfree(dev); return ERR_PTR(rc); } nfcsim_debugfs_init_dev(dev); return dev; } static void nfcsim_device_free(struct nfcsim *dev) { nfc_digital_unregister_device(dev->nfc_digital_dev); dev->up = false; nfcsim_link_shutdown(dev->link_in); cancel_delayed_work_sync(&dev->send_work); cancel_work_sync(&dev->recv_work); nfc_digital_free_device(dev->nfc_digital_dev); kfree(dev); } static struct nfcsim *dev0; static struct nfcsim *dev1; static int __init nfcsim_init(void) { struct nfcsim_link *link0, *link1; int rc; link0 = nfcsim_link_new(); link1 = nfcsim_link_new(); if (!link0 || !link1) { rc = -ENOMEM; goto exit_err; } nfcsim_debugfs_init(); dev0 = nfcsim_device_new(link0, link1); if (IS_ERR(dev0)) { rc = PTR_ERR(dev0); goto exit_err; } dev1 = nfcsim_device_new(link1, link0); if (IS_ERR(dev1)) { nfcsim_device_free(dev0); rc = PTR_ERR(dev1); goto exit_err; } pr_info("nfcsim " NFCSIM_VERSION " initialized\n"); return 0; exit_err: pr_err("Failed to initialize nfcsim driver (%d)\n", rc); if (link0) nfcsim_link_free(link0); if (link1) nfcsim_link_free(link1); return rc; } static void __exit nfcsim_exit(void) { struct nfcsim_link *link0, *link1; link0 = dev0->link_in; link1 = dev0->link_out; nfcsim_device_free(dev0); nfcsim_device_free(dev1); nfcsim_link_free(link0); nfcsim_link_free(link1); nfcsim_debugfs_remove(); } module_init(nfcsim_init); module_exit(nfcsim_exit); MODULE_DESCRIPTION("NFCSim driver ver " NFCSIM_VERSION); MODULE_VERSION(NFCSIM_VERSION); MODULE_LICENSE("GPL");
16 16 16 2 12 11 1 1 10 2 2 1 12 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Socket Closing - normal and abnormal * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/workqueue.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/tcp.h> #include "smc.h" #include "smc_tx.h" #include "smc_cdc.h" #include "smc_close.h" /* release the clcsock that is assigned to the smc_sock */ void smc_clcsock_release(struct smc_sock *smc) { struct socket *tcp; if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; sock_release(tcp); } mutex_unlock(&smc->clcsock_release_lock); } static void smc_close_cleanup_listen(struct sock *parent) { struct sock *sk; /* Close non-accepted connections */ while ((sk = smc_accept_dequeue(parent, NULL))) smc_close_non_accepted(sk); } /* wait for sndbuf data being transmitted */ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = &smc->sk; if (!timeout) return; if (!smc_tx_prepared_sends(&smc->conn)) return; /* Send out corked data remaining in sndbuf */ smc_tx_pending(&smc->conn); smc->wait_close_tx_prepared = 1; add_wait_queue(sk_sleep(sk), &wait); while (!signal_pending(current) && timeout) { int rc; rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || READ_ONCE(sk->sk_err) == ECONNABORTED || READ_ONCE(sk->sk_err) == ECONNRESET || smc->conn.killed, &wait); if (rc) break; } remove_wait_queue(sk_sleep(sk), &wait); smc->wait_close_tx_prepared = 0; } void smc_close_wake_tx_prepared(struct smc_sock *smc) { if (smc->wait_close_tx_prepared) /* wake up socket closing */ smc->sk.sk_state_change(&smc->sk); } static int smc_close_wr(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1; return smc_cdc_get_slot_and_msg_send(conn); } static int smc_close_final(struct smc_connection *conn) { if (atomic_read(&conn->bytes_to_rcv)) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; if (conn->killed) return -EPIPE; return smc_cdc_get_slot_and_msg_send(conn); } int smc_close_abort(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return smc_cdc_get_slot_and_msg_send(conn); } static void smc_close_cancel_work(struct smc_sock *smc) { struct sock *sk = &smc->sk; release_sock(sk); if (cancel_work_sync(&smc->conn.close_work)) sock_put(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); } /* terminate smc socket abnormally - active abort * link group is terminated, i.e. RDMA communication no longer possible */ void smc_close_active_abort(struct smc_sock *smc) { struct sock *sk = &smc->sk; bool release_clcsock = false; if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; if (smc->clcsock && smc->clcsock->sk) tcp_abort(smc->clcsock->sk, ECONNABORTED); } switch (sk->sk_state) { case SMC_ACTIVE: case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); if (sk->sk_state != SMC_PEERABORTWAIT) break; sk->sk_state = SMC_CLOSED; sock_put(sk); /* (postponed) passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); if (sk->sk_state != SMC_PEERABORTWAIT) break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); if (sk->sk_state != SMC_PEERABORTWAIT) break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; break; case SMC_INIT: case SMC_PEERABORTWAIT: case SMC_CLOSED: break; } smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); if (release_clcsock) { release_sock(sk); smc_clcsock_release(smc); lock_sock(sk); } } static inline bool smc_close_sent_any_close(struct smc_connection *conn) { return conn->local_tx_ctrl.conn_state_flags.peer_conn_abort || conn->local_tx_ctrl.conn_state_flags.peer_conn_closed; } int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; long timeout; int rc = 0; int rc1 = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; old_state = sk->sk_state; again: switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_CLOSED; break; case SMC_LISTEN: sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); release_sock(sk); flush_work(&smc->tcp_listen_work); lock_sock(sk); break; case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); sk->sk_state = SMC_PEERCLOSEWAIT1; /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ if (smc->clcsock && smc->clcsock->sk) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? rc : rc1; } } else { /* peer event has changed the state */ goto again; } break; case SMC_APPFINCLOSEWAIT: /* socket already shutdown wr or both (active close) */ if (txflags->peer_done_writing && !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); } sk->sk_state = SMC_CLOSED; break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state != SMC_APPCLOSEWAIT1 && sk->sk_state != SMC_APPCLOSEWAIT2) goto again; /* confirm close from peer */ rc = smc_close_final(conn); if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; sock_put(sk); /* postponed passive closing */ } else { /* peer has just issued a shutdown write */ sk->sk_state = SMC_PEERFINCLOSEWAIT; } break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (txflags->peer_done_writing && !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); } /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PEERFINCLOSEWAIT: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: rc = smc_close_abort(conn); sk->sk_state = SMC_CLOSED; break; case SMC_PEERABORTWAIT: sk->sk_state = SMC_CLOSED; break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; } if (old_state != sk->sk_state) sk->sk_state_change(sk); return rc; } static void smc_close_passive_abort_received(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; struct sock *sk = &smc->sk; switch (sk->sk_state) { case SMC_INIT: case SMC_ACTIVE: case SMC_APPCLOSEWAIT1: sk->sk_state = SMC_PROCESSABORT; sock_put(sk); /* passive closing */ break; case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PROCESSABORT; break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (txflags->peer_done_writing && !smc_close_sent_any_close(&smc->conn)) /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_PROCESSABORT; else sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_PEERABORTWAIT: sk->sk_state = SMC_CLOSED; break; case SMC_PROCESSABORT: /* nothing to do, add tracing in future patch */ break; } } /* Either some kind of closing has been received: peer_conn_closed, * peer_conn_abort, or peer_done_writing * or the link group of the connection terminates abnormally. */ static void smc_close_passive_work(struct work_struct *work) { struct smc_connection *conn = container_of(work, struct smc_connection, close_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_cdc_conn_state_flags *rxflags; bool release_clcsock = false; struct sock *sk = &smc->sk; int old_state; lock_sock(sk); old_state = sk->sk_state; rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { /* peer has not received all data */ smc_close_passive_abort_received(smc); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); goto wakeup; } switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_APPCLOSEWAIT1; break; case SMC_ACTIVE: sk->sk_state = SMC_APPCLOSEWAIT1; /* postpone sock_put() for passive closing to cover * received SEND_SHUTDOWN as well */ break; case SMC_PEERCLOSEWAIT1: if (rxflags->peer_done_writing) sk->sk_state = SMC_PEERCLOSEWAIT2; fallthrough; /* to check for closing */ case SMC_PEERCLOSEWAIT2: if (!smc_cdc_rxed_any_close(conn)) break; if (sock_flag(sk, SOCK_DEAD) && smc_close_sent_any_close(conn)) { /* smc_release has already been called locally */ sk->sk_state = SMC_CLOSED; } else { /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_APPFINCLOSEWAIT; } sock_put(sk); /* passive closing */ break; case SMC_PEERFINCLOSEWAIT: if (smc_cdc_rxed_any_close(conn)) { sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ } break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: /* postpone sock_put() for passive closing to cover * received SEND_SHUTDOWN as well */ break; case SMC_APPFINCLOSEWAIT: case SMC_PEERABORTWAIT: case SMC_PROCESSABORT: case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; } wakeup: sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */ if (old_state != sk->sk_state) { sk->sk_state_change(sk); if ((sk->sk_state == SMC_CLOSED) && (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { smc_conn_free(conn); if (smc->clcsock) release_clcsock = true; } } release_sock(sk); if (release_clcsock) smc_clcsock_release(smc); sock_put(sk); /* sock_hold done by schedulers of close_work */ } int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; long timeout; int rc = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; old_state = sk->sk_state; again: switch (sk->sk_state) { case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state != SMC_ACTIVE) goto again; /* send close wr request */ rc = smc_close_wr(conn); sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: /* passive close */ if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state != SMC_APPCLOSEWAIT1) goto again; /* confirm close from peer */ rc = smc_close_wr(conn); sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: case SMC_APPFINCLOSEWAIT: case SMC_PROCESSABORT: case SMC_PEERABORTWAIT: /* nothing to do, add tracing in future patch */ break; } if (old_state != sk->sk_state) sk->sk_state_change(sk); return rc; } /* Initialize close properties on connection establishment. */ void smc_close_init(struct smc_sock *smc) { INIT_WORK(&smc->conn.close_work, smc_close_passive_work); }
12 65 65 8 65 9 9 4 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct elevator_tags; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; #define BLK_MQ_CPU_WORK_BATCH (8) typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, struct elevator_tags *tags, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * Default to double of smaller one between hw queue_depth and * 128, since we don't split into sync/async like the old code * did. Additionally, this is a per-hw queue depth. */ static inline unsigned int blk_mq_default_nr_requests( struct blk_mq_tag_set *set) { return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ); } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q, unsigned int nr); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { /* Fast path: hardware queue is not stopped most of the time. */ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return false; /* * This barrier is used to order adding of dispatch list before and * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier * in blk_mq_start_stopped_hw_queue() so that dispatch code could * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not * empty to avoid missing dispatching requests. */ smp_mb(); return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ static inline bool blk_mq_can_poll(struct request_queue *q) { return (q->limits.features & BLK_FEAT_POLL) && q->tag_set->map[HCTX_TYPE_POLL].nr_queues; } #endif
1 3 1 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ #define _NET_BATMAN_ADV_ORIGINATOR_H_ #include "main.h" #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> #include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_release(struct kref *ref); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_hardif_neigh_release(struct kref *ref); struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_neigh_node_release(struct kref *ref); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); struct batadv_neigh_node * batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_release(struct kref *ref); int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); void batadv_orig_ifinfo_release(struct kref *ref); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); void batadv_orig_node_vlan_release(struct kref *ref); /** * batadv_choose_orig() - Return the index of the orig entry in the hash table * @data: mac address of the originator node * @size: the size of the hash table * * Return: the hash index where the object represented by @data should be * stored at. */ static inline u32 batadv_choose_orig(const void *data, u32 size) { u32 hash = 0; hash = jhash(data, ETH_ALEN, hash); return hash % size; } struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data); /** * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ static inline void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) { if (!orig_vlan) return; kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } /** * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ static inline void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) { if (!neigh_ifinfo) return; kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } /** * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ static inline void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) { if (!hardif_neigh) return; kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } /** * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly * release it * @neigh_node: neigh neighbor to free */ static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) { if (!neigh_node) return; kref_put(&neigh_node->refcount, batadv_neigh_node_release); } /** * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ static inline void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) { if (!orig_ifinfo) return; kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } /** * batadv_orig_node_put() - decrement the orig node refcounter and possibly * release it * @orig_node: the orig node to free */ static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node) { if (!orig_node) return; kref_put(&orig_node->refcount, batadv_orig_node_release); } #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
13 9 1 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for rejecting packets. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/icmp.h> #include <net/icmp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_REJECT.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include <linux/netfilter_bridge.h> #endif #include <net/netfilter/ipv4/nf_reject.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_reject_info *reject = par->targinfo; int hook = xt_hooknum(par); switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: nf_send_unreach(skb, ICMP_NET_UNREACH, hook); break; case IPT_ICMP_HOST_UNREACHABLE: nf_send_unreach(skb, ICMP_HOST_UNREACH, hook); break; case IPT_ICMP_PROT_UNREACHABLE: nf_send_unreach(skb, ICMP_PROT_UNREACH, hook); break; case IPT_ICMP_PORT_UNREACHABLE: nf_send_unreach(skb, ICMP_PORT_UNREACH, hook); break; case IPT_ICMP_NET_PROHIBITED: nf_send_unreach(skb, ICMP_NET_ANO, hook); break; case IPT_ICMP_HOST_PROHIBITED: nf_send_unreach(skb, ICMP_HOST_ANO, hook); break; case IPT_ICMP_ADMIN_PROHIBITED: nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: nf_send_reset(xt_net(par), par->state->sk, skb, hook); break; case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; } return NF_DROP; } static int reject_tg_check(const struct xt_tgchk_param *par) { const struct ipt_reject_info *rejinfo = par->targinfo; const struct ipt_entry *e = par->entryinfo; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { pr_info_ratelimited("ECHOREPLY no longer supported.\n"); return -EINVAL; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ if (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO)) { pr_info_ratelimited("TCP_RESET invalid for non-tcp\n"); return -EINVAL; } } return 0; } static struct xt_target reject_tg_reg __read_mostly = { .name = "REJECT", .family = NFPROTO_IPV4, .target = reject_tg, .targetsize = sizeof(struct ipt_reject_info), .table = "filter", .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT), .checkentry = reject_tg_check, .me = THIS_MODULE, }; static int __init reject_tg_init(void) { return xt_register_target(&reject_tg_reg); } static void __exit reject_tg_exit(void) { xt_unregister_target(&reject_tg_reg); } module_init(reject_tg_init); module_exit(reject_tg_exit);
9 1 220 220 113 527 525 9 527 526 527 45 527 44 527 527 513 45 333 333 214 215 4 4 3 3 7 2 1 1 3 1 2 1 1 2 4 4 2 2 1 4 4 3 1 1 3 3 2 2 4 2 2 2 2 2 1 1 4 4 2 2 2 1 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * This file contains sctp stream maniuplation primitives and helpers. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) { struct sctp_association *asoc; struct sctp_chunk *ch, *temp; struct sctp_outq *outq; asoc = container_of(stream, struct sctp_association, stream); outq = &asoc->outqueue; list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (sid < outcnt) continue; sctp_sched_dequeue_common(outq, ch); /* No need to call dequeue_done here because * the chunks are not scheduled by now. */ /* Mark as failed send. */ sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(ch); } } static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) { const struct sctp_sched_ops *sched; if (!SCTP_SO(stream, sid)->ext) return; sched = sctp_sched_ops_from_stream(stream); sched->free_sid(stream, sid); kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. */ static void sctp_stream_outq_migrate(struct sctp_stream *stream, struct sctp_stream *new, __u16 outcnt) { int i; if (stream->outcnt > outcnt) sctp_stream_shrink_out(stream, outcnt); if (new) { /* Here we actually move the old ext stuff into the new * buffer, because we want to keep it. Then * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { int ret; if (outcnt <= stream->outcnt) goto out; ret = genradix_prealloc(&stream->out, outcnt, gfp); if (ret) return ret; out: stream->outcnt = outcnt; return 0; } static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, gfp_t gfp) { int ret; if (incnt <= stream->incnt) goto out; ret = genradix_prealloc(&stream->in, incnt, gfp); if (ret) return ret; out: stream->incnt = incnt; return 0; } int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i, ret = 0; gfp |= __GFP_NOWARN; /* Initial stream->out size may be very big, so free it and alloc * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) return ret; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; handle_in: sctp_stream_interleave_init(stream); if (!incnt) return 0; return sctp_stream_alloc_in(stream, incnt, gfp); } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_stream_out_ext *soute; int ret; soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL); if (ret) { kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } return ret; } void sctp_stream_free(struct sctp_stream *stream) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } void sctp_stream_clear(struct sctp_stream *stream) { int i; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); sched->unsched_all(stream); sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; stream->in = new->in; stream->outcnt = new->outcnt; stream->incnt = new->incnt; sched->sched_all(stream); new->out.tree.root = NULL; new->in.tree.root = NULL; new->outcnt = 0; new->incnt = 0; } static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { int retval = 0; retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); return retval; } static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, __u16 str_nums, __be16 *str_list) { struct sctp_association *asoc; __u16 i; asoc = container_of(stream, struct sctp_association, stream); if (!asoc->outqueue.out_qlen) return true; if (!str_nums) return false; for (i = 0; i < str_nums; i++) { __u16 sid = ntohs(str_list[i]); if (SCTP_SO(stream, sid)->ext && !list_empty(&SCTP_SO(stream, sid)->ext->outq)) return false; } return true; } int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { struct sctp_stream *stream = &asoc->stream; __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; __be16 *nstr_list; bool out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING; in = params->srs_flags & SCTP_STREAM_RESET_INCOMING; if (!out && !in) goto out; str_nums = params->srs_number_streams; str_list = params->srs_stream_list; if (str_nums) { int param_len = 0; if (out) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->outcnt) goto out; param_len = str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_outreq); } if (in) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->incnt) goto out; param_len += str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_inreq); } if (param_len > SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_reconf_chunk)) goto out; } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { retval = -ENOMEM; goto out; } for (i = 0; i < str_nums; i++) nstr_list[i] = htons(str_list[i]); if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { kfree(nstr_list); retval = -EAGAIN; goto out; } chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); kfree(nstr_list); if (!chunk) { retval = -ENOMEM; goto out; } if (out) { if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; if (!out) goto out; if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_OPEN; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; goto out; } asoc->strreset_outstanding = out + in; out: return retval; } int sctp_send_reset_assoc(struct sctp_association *asoc) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u16 i; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) return -ENOPROTOOPT; if (asoc->strreset_outstanding) return -EINPROGRESS; if (!sctp_outq_is_empty(&asoc->outqueue)) return -EAGAIN; chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; /* Block further xmit of data until this request is completed */ for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; return retval; } asoc->strreset_outstanding = 1; return 0; } int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u32 outcnt, incnt; __u16 out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->sas_outstrms; in = params->sas_instrms; outcnt = stream->outcnt + out; incnt = stream->incnt + in; if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM || (!out && !in)) { retval = -EINVAL; goto out; } if (out) { retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); if (retval) goto out; } chunk = sctp_make_strreset_addstrm(asoc, out, in); if (!chunk) { retval = -ENOMEM; goto out; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; goto out; } asoc->strreset_outstanding = !!out + !!in; out: return retval; } static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param( struct sctp_association *asoc, __be32 resp_seq, __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; struct sctp_reconf_chunk *hdr; union sctp_params param; if (!chunk) return NULL; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { /* sctp_strreset_tsnreq is actually the basic structure * of all stream reconf params, so it's safe to use it * to access request_seq. */ struct sctp_strreset_tsnreq *req = param.v; if ((!resp_seq || req->request_seq == resp_seq) && (!type || type == req->param_hdr.type)) return param.v; } return NULL; } static void sctp_update_strreset_result(struct sctp_association *asoc, __u32 result) { asoc->strreset_result[1] = asoc->strreset_result[0]; asoc->strreset_result[0] = result; } struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __be16 *str_p = NULL; __u32 request_seq; __u16 i, nums; request_seq = ntohl(outreq->request_seq); if (ntohl(outreq->send_reset_at_tsn) > sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; /* Check strreset_enable after inseq inc, as sender cannot tell * the peer doesn't enable strreset after receiving response with * result denied, as well as to keep consistent with bsd. */ if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); str_p = outreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->incnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, outreq->response_seq, SCTP_PARAM_RESET_IN_REQUEST)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } if (nums) for (i = 0; i < nums; i++) SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; else for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_reset_event(asoc, SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_inreq *inreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq; __u16 i, nums; __be16 *str_p; request_seq = ntohl(inreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16); str_p = inreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->outcnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (!sctp_stream_outq_is_empty(stream, nums, str_p)) { result = SCTP_STRRESET_IN_PROGRESS; asoc->strreset_inseq--; goto err; } chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); if (!chunk) goto out; if (nums) for (i = 0; i < nums; i++) SCTP_SO(stream, ntohs(str_p[i]))->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_tsnreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen; struct sctp_strreset_tsnreq *tsnreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq; __u16 i; request_seq = ntohl(tsnreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; } goto err; } if (!sctp_outq_is_empty(&asoc->outqueue)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } /* G4: The same processing as though a FWD-TSN chunk (as defined in * [RFC3758]) with all streams affected and a new cumulative TSN * ACK of the Receiver's Next TSN minus 1 were received MUST be * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen); /* G1: Compute an appropriate value for the Receiver's Next TSN -- the * TSN that the peer should use to send the next DATA chunk. The * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); /* G3: The same processing as though a SACK chunk with no gap report * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were * received MUST be performed. */ sctp_outq_free(&asoc->outqueue); /* G2: Compute an appropriate value for the local endpoint's next TSN, * i.e., the next TSN assigned by the receiver of the SSN/TSN reset * chunk. The value SHOULD be the highest TSN sent by the receiver * of the request plus 1. */ next_tsn = asoc->next_tsn; asoc->ctsn_ack_point = next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all * incoming and outgoing streams. */ for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, 0, init_tsn, next_tsn, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_tsnresp(asoc, result, request_seq, next_tsn, init_tsn); } struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq, incnt; __u16 in, i; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; in = ntohs(addstrm->number_of_streams); incnt = stream->incnt + in; if (!in || incnt > SCTP_MAX_STREAM) goto out; if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_change_event(asoc, 0, ntohs(addstrm->number_of_streams), 0, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } out = ntohs(addstrm->number_of_streams); outcnt = stream->outcnt + out; if (!out || outcnt > SCTP_MAX_STREAM) goto out; ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); if (ret) goto out; chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); stream->outcnt = outcnt; result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_resp( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_stream *stream = &asoc->stream; struct sctp_strreset_resp *resp = param.v; struct sctp_transport *t; __u16 i, nums, flags = 0; struct sctp_paramhdr *req; __u32 result; req = sctp_chunk_lookup_strreset_param(asoc, resp->response_seq, 0); if (!req) return NULL; result = ntohl(resp->result); if (result != SCTP_STRRESET_PERFORMED) { /* if in progress, do nothing but retransmit */ if (result == SCTP_STRRESET_IN_PROGRESS) return NULL; else if (result == SCTP_STRRESET_DENIED) flags = SCTP_STREAM_RESET_DENIED; else flags = SCTP_STREAM_RESET_FAILED; } if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { struct sctp_strreset_outreq *outreq; __be16 *str_p; outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { struct sctp_stream_out *sout; if (nums) { for (i = 0; i < nums; i++) { sout = SCTP_SO(stream, ntohs(str_p[i])); sout->mid = 0; sout->mid_uo = 0; } } else { for (i = 0; i < stream->outcnt; i++) { sout = SCTP_SO(stream, i); sout->mid = 0; sout->mid_uo = 0; } } } flags |= SCTP_STREAM_RESET_OUTGOING_SSN; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { struct sctp_strreset_inreq *inreq; __be16 *str_p; /* if the result is performed, it's impossible for inreq */ if (result == SCTP_STRRESET_PERFORMED) return NULL; inreq = (struct sctp_strreset_inreq *)req; str_p = inreq->list_of_streams; nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / sizeof(__u16); flags |= SCTP_STREAM_RESET_INCOMING_SSN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) { struct sctp_strreset_resptsn *resptsn; __u32 stsn, rtsn; /* check for resptsn, as sctp_verify_reconf didn't do it*/ if (ntohs(param.p->length) != sizeof(*resptsn)) return NULL; resptsn = (struct sctp_strreset_resptsn *)resp; stsn = ntohl(resptsn->senders_next_tsn); rtsn = ntohl(resptsn->receivers_next_tsn); if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); LIST_HEAD(temp); asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC); /* Clean up sacked and abandoned queues only. As the * out_chunk_list may not be empty, splice it to temp, * then get it back after sctp_outq_free is done. */ list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); list_splice_init(&temp, &asoc->outqueue.out_chunk_list); asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, stsn, rtsn, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) { struct sctp_strreset_addstrm *addstrm; __u16 number; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); number = stream->outcnt - nums; if (result == SCTP_STRRESET_PERFORMED) { for (i = number; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; } else { sctp_stream_shrink_out(stream, number); stream->outcnt = number; } *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, 0, nums, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) { struct sctp_strreset_addstrm *addstrm; /* if the result is performed, it's impossible for addstrm in * request. */ if (result == SCTP_STRRESET_PERFORMED) return NULL; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, nums, 0, GFP_ATOMIC); } asoc->strreset_outstanding--; asoc->strreset_outseq++; /* remove everything for this reconf request */ if (!asoc->strreset_outstanding) { t = asoc->strreset_chunk->transport; if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } return NULL; }
28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 /* SPDX-License-Identifier: GPL-2.0 */ /* Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ #ifndef _LINUX_KALLSYMS_H #define _LINUX_KALLSYMS_H #include <linux/errno.h> #include <linux/buildid.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> #include <asm/sections.h> #define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ (BUILD_ID_SIZE_MAX * 2) + 1) struct cred; struct module; static inline int is_kernel_text(unsigned long addr) { if (__is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (__is_kernel(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_ksym_addr(unsigned long addr) { if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); } static inline void *dereference_symbol_descriptor(void *ptr) { #ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); if (is_ksym_addr((unsigned long)ptr)) return ptr; guard(rcu)(); mod = __module_address((unsigned long)ptr); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); #endif return ptr; } /* How and when do we show kallsyms values? */ extern bool kallsyms_show_value(const struct cred *cred); #ifdef CONFIG_KALLSYMS unsigned long kallsyms_sym_address(int idx); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data); int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data); /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); /* Lookup an address. modname is set to NULL if it's in the kernel. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf); /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_build_id(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); extern int sprint_backtrace_build_id(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) { return 0; } static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { return 0; } static inline const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return NULL; } static inline int sprint_symbol(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_symbol_build_id(char *buffer, unsigned long address) { *buffer = '\0'; return 0; } static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace_build_id(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } static inline int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { return -EOPNOTSUPP; } #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) { printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/
1 3 5 1 2 4 4 4 4 13 4 28 14 12 15 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> #include <net/inet_common.h> #include "udp_impl.h" static struct proto *udpv6_prot_saved __read_mostly; static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len); #endif return udp_prot.recvmsg(sk, msg, len, flags, addr_len); } static bool udp_sk_has_data(struct sock *sk) { return !skb_queue_empty(&udp_sk(sk)->reader_queue) || !skb_queue_empty(&sk->sk_receive_queue); } static bool psock_has_data(struct sk_psock *psock) { return !skb_queue_empty(&psock->ingress_skb) || !sk_psock_queue_empty(psock); } #define udp_msg_has_data(__sk, __psock) \ ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = udp_msg_has_data(sk, psock); if (!ret) { wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ret = udp_msg_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return sk_udp_recvmsg(sk, msg, len, flags, addr_len); if (!psock_has_data(psock)) { ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); goto out; } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = udp_msg_wait_data(sk, psock, timeo); if (data) { if (psock_has_data(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); goto out; } copied = -EAGAIN; } ret = copied; out: sk_psock_put(sk, psock); return ret; } enum { UDP_BPF_IPV4, UDP_BPF_IPV6, UDP_BPF_NUM_PROTS, }; static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = udp_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { spin_lock_bh(&udpv6_prot_lock); if (likely(ops != udpv6_prot_saved)) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops); smp_store_release(&udpv6_prot_saved, ops); } spin_unlock_bh(&udpv6_prot_lock); } } static int __init udp_bpf_v4_build_proto(void) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot); return 0; } late_initcall(udp_bpf_v4_build_proto); int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &udp_bpf_prots[family]); return 0; } EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
6 5 5 1 1 11 11 3 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers * * Copyright (C) 2015 Martin Willi * Copyright (C) 2018 Google LLC */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <crypto/chacha.h> #include <crypto/internal/skcipher.h> #include <linux/module.h> struct chacha_ctx { u32 key[8]; int nrounds; }; static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize, int nrounds) { struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); int i; if (keysize != CHACHA_KEY_SIZE) return -EINVAL; for (i = 0; i < ARRAY_SIZE(ctx->key); i++) ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); ctx->nrounds = nrounds; return 0; } static int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 20); } static int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 12); } static int chacha_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 iv[CHACHA_IV_SIZE]) { struct skcipher_walk walk; struct chacha_state state; int err; err = skcipher_walk_virt(&walk, req, false); chacha_init(&state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE); chacha_crypt(&state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } static int crypto_chacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); return chacha_stream_xor(req, ctx, req->iv); } static int crypto_xchacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct chacha_ctx subctx; struct chacha_state state; u8 real_iv[16]; /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init(&state, ctx->key, req->iv); hchacha_block(&state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; /* Build the real IV */ memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ /* Generate the stream and XOR it with the data */ return chacha_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", .base.cra_driver_name = "chacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_chacha_crypt, .decrypt = crypto_chacha_crypt, }, { .base.cra_name = "xchacha20", .base.cra_driver_name = "xchacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, }, { .base.cra_name = "xchacha12", .base.cra_driver_name = "xchacha12-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, } }; static int __init crypto_chacha_mod_init(void) { return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit crypto_chacha_mod_fini(void) { crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(crypto_chacha_mod_init); module_exit(crypto_chacha_mod_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha12"); MODULE_ALIAS_CRYPTO("xchacha12-lib");
8 2 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> * * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>. */ #include <linux/init.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/netfilter/xt_statistic.h> #include <linux/netfilter/x_tables.h> #include <linux/module.h> struct xt_statistic_priv { atomic_t count; } ____cacheline_aligned_in_smp; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); MODULE_ALIAS("ipt_statistic"); MODULE_ALIAS("ip6t_statistic"); static bool statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_statistic_info *info = par->matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; int nval, oval; switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability) ret = !ret; break; case XT_STATISTIC_MODE_NTH: do { oval = atomic_read(&info->master->count); nval = (oval == info->u.nth.every) ? 0 : oval + 1; } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval); if (nval == 0) ret = !ret; break; } return ret; } static int statistic_mt_check(const struct xt_mtchk_param *par) { struct xt_statistic_info *info = par->matchinfo; if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) return -EINVAL; info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); if (info->master == NULL) return -ENOMEM; atomic_set(&info->master->count, info->u.nth.count); return 0; } static void statistic_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_statistic_info *info = par->matchinfo; kfree(info->master); } static struct xt_match xt_statistic_mt_reg __read_mostly = { .name = "statistic", .revision = 0, .family = NFPROTO_UNSPEC, .match = statistic_mt, .checkentry = statistic_mt_check, .destroy = statistic_mt_destroy, .matchsize = sizeof(struct xt_statistic_info), .usersize = offsetof(struct xt_statistic_info, master), .me = THIS_MODULE, }; static int __init statistic_mt_init(void) { return xt_register_match(&xt_statistic_mt_reg); } static void __exit statistic_mt_exit(void) { xt_unregister_match(&xt_statistic_mt_reg); } module_init(statistic_mt_init); module_exit(statistic_mt_exit);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H #ifdef __KERNEL__ #include <asm/nops.h> #include <asm/processor-flags.h> #include <linux/errno.h> #include <linux/irqflags.h> #include <linux/jump_label.h> void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; asm volatile("mov %%cr0,%0" : "=r" (val)); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; asm volatile("mov %%cr2,%0" : "=r" (val)); return val; } static __always_inline void native_write_cr2(unsigned long val) { asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } static __always_inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0" : "=r" (val)); return val; } static __always_inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } static inline unsigned long native_read_cr4(void) { unsigned long val; #ifdef CONFIG_X86_32 /* * This could fault if CR4 does not exist. Non-existent CR4 * is functionally equivalent to CR4 == 0. Keep it simple and pretend * that CR4 == 0 on CPUs that don't have CR4. */ asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) : "=r" (val) : "0" (0)); #else /* CR4 always exists on x86_64. */ asm volatile("mov %%cr4,%0" : "=r" (val)); #endif return val; } void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; /* * "rdpkru" instruction. Places PKRU contents in to EAX, * clears EDX and requires that ecx=0. */ asm volatile("rdpkru" : "=a" (pkru), "=d" (edx) : "c" (ecx)); return pkru; } static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; /* * "wrpkru" instruction. Loads contents in EAX to PKRU, * requires that ecx = edx = 0. */ asm volatile("wrpkru" : : "a" (pkru), "c"(ecx), "d"(edx)); } #else static inline u32 rdpkru(void) { return 0; } static inline void wrpkru(u32 pkru) { } #endif /* * Write back all modified lines in all levels of cache associated with this * logical processor to main memory, and then invalidate all caches. Depending * on the micro-architecture, WBINVD (and WBNOINVD below) may or may not affect * lower level caches associated with another logical processor that shares any * level of this processor's cache hierarchy. */ static __always_inline void wbinvd(void) { asm volatile("wbinvd" : : : "memory"); } /* Instruction encoding provided for binutils backwards compatibility. */ #define ASM_WBNOINVD _ASM_BYTES(0xf3,0x0f,0x09) /* * Write back all modified lines in all levels of cache associated with this * logical processor to main memory, but do NOT explicitly invalidate caches, * i.e. leave all/most cache lines in the hierarchy in non-modified state. */ static __always_inline void wbnoinvd(void) { /* * Explicitly encode WBINVD if X86_FEATURE_WBNOINVD is unavailable even * though WBNOINVD is backwards compatible (it's simply WBINVD with an * ignored REP prefix), to guarantee that WBNOINVD isn't used if it * needs to be avoided for any reason. For all supported usage in the * kernel, WBINVD is functionally a superset of WBNOINVD. */ alternative("wbinvd", ASM_WBNOINVD, X86_FEATURE_WBNOINVD); } static inline unsigned long __read_cr4(void) { return native_read_cr4(); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else static inline unsigned long read_cr0(void) { return native_read_cr0(); } static inline void write_cr0(unsigned long x) { native_write_cr0(x); } static __always_inline unsigned long read_cr2(void) { return native_read_cr2(); } static __always_inline void write_cr2(unsigned long x) { native_write_cr2(x); } /* * Careful! CR3 contains more than just an address. You probably want * read_cr3_pa() instead. */ static inline unsigned long __read_cr3(void) { return __native_read_cr3(); } static inline void write_cr3(unsigned long x) { native_write_cr3(x); } static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } #endif /* CONFIG_PARAVIRT_XXL */ static __always_inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } static inline void clflushopt(volatile void *__p) { alternative_io("ds clflush %0", "clflushopt %0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } static inline void clwb(volatile void *__p) { volatile struct { char x[64]; } *p = __p; asm_inline volatile(ALTERNATIVE_2( "ds clflush %0", "clflushopt %0", X86_FEATURE_CLFLUSHOPT, "clwb %0", X86_FEATURE_CLWB) : "+m" (*p)); } #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { asm goto("1: wrussq %[val], %[addr]\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "m" (*addr), [val] "r" (val) :: fail); return 0; fail: return -EFAULT; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #define nop() asm volatile ("nop") static __always_inline void serialize(void) { /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); } /* The dst parameter must be 64-bytes aligned */ static inline void movdir64b(void *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } *__dst = dst; /* * MOVDIR64B %(rdx), rax. * * Both __src and __dst must be memory constraints in order to tell the * compiler that no other memory accesses should be reordered around * this one. * * Also, both must be supplied as lvalues because this tells * the compiler what the object is (its size) the instruction accesses. * I.e., not the pointers but what they point to, thus the deref'ing '*'. */ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" : "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); } static inline void movdir64b_io(void __iomem *dst, const void *src) { movdir64b((void __force *)dst, src); } /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: 512 bits memory operand * * The ENQCMDS instruction allows software to write a 512-bit command to * a 512-bit-aligned special MMIO region that supports the instruction. * A return status is loaded into the ZF flag in the RFLAGS register. * ZF = 0 equates to success, and ZF = 1 indicates retry or error. * * This function issues the ENQCMDS instruction to submit data from * kernel space to MMIO space, in a unit of 512 bits. Order of data access * is not guaranteed, nor is a memory barrier performed afterwards. It * returns 0 on success and -EAGAIN on failure. * * Warning: Do not use this helper unless your driver has checked that the * ENQCMDS instruction is supported on the platform and the device accepts * ENQCMDS. */ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; bool zf; /* * ENQCMDS %(rdx), rax * * See movdir64b()'s comment on operand specification. */ asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" : "=@ccz" (zf), "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); /* Submission failure is indicated via EFLAGS.ZF=1 */ if (zf) return -EAGAIN; return 0; } static __always_inline void tile_release(void) { /* * Instruction opcode for TILERELEASE; supported in binutils * version >= 2.36. */ asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); } #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */
6 6 6 9 6 5 5 5 6 6 6 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> static struct ax25_protocol *protocol_list; static DEFINE_RWLOCK(protocol_list_lock); static HLIST_HEAD(ax25_linkfail_list); static DEFINE_SPINLOCK(linkfail_lock); static struct listen_struct { struct listen_struct *next; ax25_address callsign; struct net_device *dev; } *listen_list = NULL; static DEFINE_SPINLOCK(listen_lock); /* * Do not register the internal protocols AX25_P_TEXT, AX25_P_SEGMENT, * AX25_P_IP or AX25_P_ARP ... */ void ax25_register_pid(struct ax25_protocol *ap) { write_lock_bh(&protocol_list_lock); ap->next = protocol_list; protocol_list = ap; write_unlock_bh(&protocol_list_lock); } EXPORT_SYMBOL_GPL(ax25_register_pid); void ax25_protocol_release(unsigned int pid) { struct ax25_protocol *protocol; write_lock_bh(&protocol_list_lock); protocol = protocol_list; if (protocol == NULL) goto out; if (protocol->pid == pid) { protocol_list = protocol->next; goto out; } while (protocol != NULL && protocol->next != NULL) { if (protocol->next->pid == pid) { protocol->next = protocol->next->next; goto out; } protocol = protocol->next; } out: write_unlock_bh(&protocol_list_lock); } EXPORT_SYMBOL(ax25_protocol_release); void ax25_linkfail_register(struct ax25_linkfail *lf) { spin_lock_bh(&linkfail_lock); hlist_add_head(&lf->lf_node, &ax25_linkfail_list); spin_unlock_bh(&linkfail_lock); } EXPORT_SYMBOL(ax25_linkfail_register); void ax25_linkfail_release(struct ax25_linkfail *lf) { spin_lock_bh(&linkfail_lock); hlist_del_init(&lf->lf_node); spin_unlock_bh(&linkfail_lock); } EXPORT_SYMBOL(ax25_linkfail_release); int ax25_listen_register(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; if (ax25_listen_mine(callsign, dev)) return 0; if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL) return -ENOMEM; listen->callsign = *callsign; listen->dev = dev; spin_lock_bh(&listen_lock); listen->next = listen_list; listen_list = listen; spin_unlock_bh(&listen_lock); return 0; } EXPORT_SYMBOL(ax25_listen_register); void ax25_listen_release(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *s, *listen; spin_lock_bh(&listen_lock); listen = listen_list; if (listen == NULL) { spin_unlock_bh(&listen_lock); return; } if (ax25cmp(&listen->callsign, callsign) == 0 && listen->dev == dev) { listen_list = listen->next; spin_unlock_bh(&listen_lock); kfree(listen); return; } while (listen != NULL && listen->next != NULL) { if (ax25cmp(&listen->next->callsign, callsign) == 0 && listen->next->dev == dev) { s = listen->next; listen->next = listen->next->next; spin_unlock_bh(&listen_lock); kfree(s); return; } listen = listen->next; } spin_unlock_bh(&listen_lock); } EXPORT_SYMBOL(ax25_listen_release); int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) { int (*res)(struct sk_buff *, ax25_cb *) = NULL; struct ax25_protocol *protocol; read_lock(&protocol_list_lock); for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) if (protocol->pid == pid) { res = protocol->func; break; } read_unlock(&protocol_list_lock); return res; } int ax25_listen_mine(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; spin_lock_bh(&listen_lock); for (listen = listen_list; listen != NULL; listen = listen->next) if (ax25cmp(&listen->callsign, callsign) == 0 && (listen->dev == dev || listen->dev == NULL)) { spin_unlock_bh(&listen_lock); return 1; } spin_unlock_bh(&listen_lock); return 0; } void ax25_link_failed(ax25_cb *ax25, int reason) { struct ax25_linkfail *lf; spin_lock_bh(&linkfail_lock); hlist_for_each_entry(lf, &ax25_linkfail_list, lf_node) lf->func(ax25, reason); spin_unlock_bh(&linkfail_lock); } int ax25_protocol_is_registered(unsigned int pid) { struct ax25_protocol *protocol; int res = 0; read_lock_bh(&protocol_list_lock); for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) if (protocol->pid == pid) { res = 1; break; } read_unlock_bh(&protocol_list_lock); return res; }
3 3 3 2 3 2 2 3 4 3 4 4 2 1 4 3 3 3 1 1 1 1 2 3 3 3 3 3 3 2 2 3 2 4 2 1 4 3 1 1 2 3 3 3 4 4 4 4 4 4 4 4 3 1 68 68 1 2 61 4 45 19 29 4 1 5 1 2 8 8 9 5 9 1 2 1 10 2 12 10 3 11 3 11 3 13 1 13 1 5 11 12 2 11 5 11 5 62 62 55 54 55 4 4 4 55 68 68 63 8 9 9 3 9 9 9 9 9 9 9 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_flow.c Generic flow classifier * * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/pkt_cls.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_sock.h> #include <net/pkt_cls.h> #include <net/ip.h> #include <net/route.h> #include <net/flow_dissector.h> #include <net/tc_wrapper.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif struct flow_head { struct list_head filters; struct rcu_head rcu; }; struct flow_filter { struct list_head list; struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_proto *tp; struct timer_list perturb_timer; u32 perturb_period; u32 handle; u32 nkeys; u32 keymask; u32 mode; u32 mask; u32 xor; u32 rshift; u32 addend; u32 divisor; u32 baseclass; u32 hashrnd; struct rcu_work rwork; }; static inline u32 addr_fold(void *addr) { unsigned long a = (unsigned long)addr; return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0); } static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { __be32 src = flow_get_u32_src(flow); if (src) return ntohl(src); return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { __be32 dst = flow_get_u32_dst(flow); if (dst) return ntohl(dst); return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.src); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_iif(const struct sk_buff *skb) { return skb->skb_iif; } static u32 flow_get_priority(const struct sk_buff *skb) { return skb->priority; } static u32 flow_get_mark(const struct sk_buff *skb) { return skb->mark; } static u32 flow_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return addr_fold(skb_nfct(skb)); #else return 0; #endif } #if IS_ENABLED(CONFIG_NF_CONNTRACK) #define CTTUPLE(skb, member) \ ({ \ enum ip_conntrack_info ctinfo; \ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \ if (ct == NULL) \ goto fallback; \ ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \ }) #else #define CTTUPLE(skb, member) \ ({ \ goto fallback; \ 0; \ }) #endif static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, src.u3.ip6[3])); } fallback: return flow_get_src(skb, flow); } static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); } fallback: return flow_get_dst(skb, flow); } static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: return flow_get_proto_src(skb, flow); } static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: return flow_get_proto_dst(skb, flow); } static u32 flow_get_rtclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID if (skb_dst(skb)) return skb_dst(skb)->tclassid; #endif return 0; } static u32 flow_get_skuid(const struct sk_buff *skb) { struct sock *sk = skb_to_full_sk(skb); if (sk && sk->sk_socket && sk->sk_socket->file) { kuid_t skuid = sk->sk_socket->file->f_cred->fsuid; return from_kuid(&init_user_ns, skuid); } return 0; } static u32 flow_get_skgid(const struct sk_buff *skb) { struct sock *sk = skb_to_full_sk(skb); if (sk && sk->sk_socket && sk->sk_socket->file) { kgid_t skgid = sk->sk_socket->file->f_cred->fsgid; return from_kgid(&init_user_ns, skgid); } return 0; } static u32 flow_get_vlan_tag(const struct sk_buff *skb) { u16 tag; if (vlan_get_tag(skb, &tag) < 0) return 0; return tag & VLAN_VID_MASK; } static u32 flow_get_rxhash(struct sk_buff *skb) { return skb_get_hash(skb); } static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) { switch (key) { case FLOW_KEY_SRC: return flow_get_src(skb, flow); case FLOW_KEY_DST: return flow_get_dst(skb, flow); case FLOW_KEY_PROTO: return flow_get_proto(skb, flow); case FLOW_KEY_PROTO_SRC: return flow_get_proto_src(skb, flow); case FLOW_KEY_PROTO_DST: return flow_get_proto_dst(skb, flow); case FLOW_KEY_IIF: return flow_get_iif(skb); case FLOW_KEY_PRIORITY: return flow_get_priority(skb); case FLOW_KEY_MARK: return flow_get_mark(skb); case FLOW_KEY_NFCT: return flow_get_nfct(skb); case FLOW_KEY_NFCT_SRC: return flow_get_nfct_src(skb, flow); case FLOW_KEY_NFCT_DST: return flow_get_nfct_dst(skb, flow); case FLOW_KEY_NFCT_PROTO_SRC: return flow_get_nfct_proto_src(skb, flow); case FLOW_KEY_NFCT_PROTO_DST: return flow_get_nfct_proto_dst(skb, flow); case FLOW_KEY_RTCLASSID: return flow_get_rtclassid(skb); case FLOW_KEY_SKUID: return flow_get_skuid(skb); case FLOW_KEY_SKGID: return flow_get_skgid(skb); case FLOW_KEY_VLAN_TAG: return flow_get_vlan_tag(skb); case FLOW_KEY_RXHASH: return flow_get_rxhash(skb); default: WARN_ON(1); return 0; } } #define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \ (1 << FLOW_KEY_DST) | \ (1 << FLOW_KEY_PROTO) | \ (1 << FLOW_KEY_PROTO_SRC) | \ (1 << FLOW_KEY_PROTO_DST) | \ (1 << FLOW_KEY_NFCT_SRC) | \ (1 << FLOW_KEY_NFCT_DST) | \ (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ (1 << FLOW_KEY_NFCT_PROTO_DST)) TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct flow_head *head = rcu_dereference_bh(tp->root); struct flow_filter *f; u32 keymask; u32 classid; unsigned int n, key; int r; list_for_each_entry_rcu(f, &head->filters, list) { u32 keys[FLOW_KEY_MAX + 1]; struct flow_keys flow_keys; if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) skb_flow_dissect_flow_keys(skb, &flow_keys, 0); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; keymask &= ~(1 << key); keys[n] = flow_key_get(skb, key, &flow_keys); } if (f->mode == FLOW_MODE_HASH) classid = jhash2(keys, f->nkeys, f->hashrnd); else { classid = keys[0]; classid = (classid & f->mask) ^ f->xor; classid = (classid >> f->rshift) + f->addend; } if (f->divisor) classid %= f->divisor; res->class = 0; res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid); r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; return r; } return -1; } static void flow_perturbation(struct timer_list *t) { struct flow_filter *f = timer_container_of(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) mod_timer(&f->perturb_timer, jiffies + f->perturb_period); } static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, [TCA_FLOW_DIVISOR] = { .type = NLA_U32 }, [TCA_FLOW_ACT] = { .type = NLA_NESTED }, [TCA_FLOW_POLICE] = { .type = NLA_NESTED }, [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED }, [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; static void __flow_destroy_filter(struct flow_filter *f) { timer_shutdown_sync(&f->perturb_timer); tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); tcf_exts_put_net(&f->exts); kfree(f); } static void flow_destroy_filter_work(struct work_struct *work) { struct flow_filter *f = container_of(to_rcu_work(work), struct flow_filter, rwork); rtnl_lock(); __flow_destroy_filter(f); rtnl_unlock(); } static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *fold, *fnew; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FLOW_MAX + 1]; unsigned int nkeys = 0; unsigned int perturb_period = 0; u32 baseclass = 0; u32 keymask = 0; u32 mode; int err; if (opt == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_FLOW_MAX, opt, flow_policy, NULL); if (err < 0) return err; if (tb[TCA_FLOW_BASECLASS]) { baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]); if (TC_H_MIN(baseclass) == 0) return -EINVAL; } if (tb[TCA_FLOW_KEYS]) { keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); nkeys = hweight32(keymask); if (nkeys == 0) return -EINVAL; if (fls(keymask) - 1 > FLOW_KEY_MAX) return -EOPNOTSUPP; if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) return -EOPNOTSUPP; } fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); if (!fnew) return -ENOBUFS; err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &fnew->ematches); if (err < 0) goto err1; err = tcf_exts_init(&fnew->exts, net, TCA_FLOW_ACT, TCA_FLOW_POLICE); if (err < 0) goto err2; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, flags, extack); if (err < 0) goto err2; fold = *arg; if (fold) { err = -EINVAL; if (fold->handle != handle && handle) goto err2; /* Copy fold into fnew */ fnew->tp = fold->tp; fnew->handle = fold->handle; fnew->nkeys = fold->nkeys; fnew->keymask = fold->keymask; fnew->mode = fold->mode; fnew->mask = fold->mask; fnew->xor = fold->xor; fnew->rshift = fold->rshift; fnew->addend = fold->addend; fnew->divisor = fold->divisor; fnew->baseclass = fold->baseclass; fnew->hashrnd = fold->hashrnd; mode = fold->mode; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (mode == FLOW_MODE_HASH) perturb_period = fold->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } } else { err = -EINVAL; if (!handle) goto err2; if (!tb[TCA_FLOW_KEYS]) goto err2; mode = FLOW_MODE_MAP; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } if (TC_H_MAJ(baseclass) == 0) { struct Qdisc *q = tcf_block_q(tp->chain->block); baseclass = TC_H_MAKE(q->handle, baseclass); } if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 1); fnew->handle = handle; fnew->mask = ~0U; fnew->tp = tp; get_random_bytes(&fnew->hashrnd, 4); } timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE); tcf_block_netif_keep_dst(tp->chain->block); if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; fnew->nkeys = nkeys; } fnew->mode = mode; if (tb[TCA_FLOW_MASK]) fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]); if (tb[TCA_FLOW_XOR]) fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]); if (tb[TCA_FLOW_RSHIFT]) fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); if (tb[TCA_FLOW_ADDEND]) fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); if (tb[TCA_FLOW_DIVISOR]) fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); if (baseclass) fnew->baseclass = baseclass; fnew->perturb_period = perturb_period; if (perturb_period) mod_timer(&fnew->perturb_timer, jiffies + perturb_period); if (!*arg) list_add_tail_rcu(&fnew->list, &head->filters); else list_replace_rcu(&fold->list, &fnew->list); *arg = fnew; if (fold) { tcf_exts_get_net(&fold->exts); tcf_queue_work(&fold->rwork, flow_destroy_filter_work); } return 0; err2: tcf_exts_destroy(&fnew->exts); tcf_em_tree_destroy(&fnew->ematches); err1: kfree(fnew); return err; } static int flow_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f = arg; list_del_rcu(&f->list); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, flow_destroy_filter_work); *last = list_empty(&head->filters); return 0; } static int flow_init(struct tcf_proto *tp) { struct flow_head *head; head = kzalloc(sizeof(*head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->filters); rcu_assign_pointer(tp->root, head); return 0; } static void flow_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; list_for_each_entry_safe(f, next, &head->filters, list) { list_del_rcu(&f->list); if (tcf_exts_get_net(&f->exts)) tcf_queue_work(&f->rwork, flow_destroy_filter_work); else __flow_destroy_filter(f); } kfree_rcu(head, rcu); } static void *flow_get(struct tcf_proto *tp, u32 handle) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; list_for_each_entry(f, &head->filters, list) if (f->handle == handle) return f; return NULL; } static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct flow_filter *f = fh; struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) goto nla_put_failure; if (f->mask != ~0 || f->xor != 0) { if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) goto nla_put_failure; } if (f->rshift && nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) goto nla_put_failure; if (f->addend && nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) goto nla_put_failure; if (f->divisor && nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor)) goto nla_put_failure; if (f->baseclass && nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) goto nla_put_failure; if (f->perturb_period && nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_EMATCH if (f->ematches.hdr.nmatches && tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) goto nla_put_failure; #endif nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; list_for_each_entry(f, &head->filters, list) { if (!tc_cls_stats_dump(tp, arg, f)) break; } } static struct tcf_proto_ops cls_flow_ops __read_mostly = { .kind = "flow", .classify = flow_classify, .init = flow_init, .destroy = flow_destroy, .change = flow_change, .delete = flow_delete, .get = flow_get, .dump = flow_dump, .walk = flow_walk, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_CLS("flow"); static int __init cls_flow_init(void) { return register_tcf_proto_ops(&cls_flow_ops); } static void __exit cls_flow_exit(void) { unregister_tcf_proto_ops(&cls_flow_ops); } module_init(cls_flow_init); module_exit(cls_flow_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("TC flow classifier");
10 10 10 10 10 10 10 10 13 13 13 10 10 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 /* * Copyright (c) 2014 Chelsio, Inc. All rights reserved. * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "iwpm_util.h" #define IWPM_MAPINFO_HASH_SIZE 512 #define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) #define IWPM_REMINFO_HASH_SIZE 64 #define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) #define IWPM_MSG_SIZE 512 static LIST_HEAD(iwpm_nlmsg_req_list); static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); static struct hlist_head *iwpm_hash_bucket; static DEFINE_SPINLOCK(iwpm_mapinfo_lock); static struct hlist_head *iwpm_reminfo_bucket; static DEFINE_SPINLOCK(iwpm_reminfo_lock); static struct iwpm_admin_data iwpm_admin; /** * iwpm_init - Allocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes up. */ int iwpm_init(u8 nl_client) { iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_hash_bucket) return -ENOMEM; iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_reminfo_bucket) { kfree(iwpm_hash_bucket); return -ENOMEM; } iwpm_set_registration(nl_client, IWPM_REG_UNDEF); pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); return 0; } static void free_hash_bucket(void); static void free_reminfo_bucket(void); /** * iwpm_exit - Deallocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes down. */ int iwpm_exit(u8 nl_client) { free_hash_bucket(); free_reminfo_bucket(); pr_debug("%s: Resources are destroyed\n", __func__); iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); /** * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address * info in a hash table * @local_sockaddr: Local ip/tcp address * @mapped_sockaddr: Mapped local ip/tcp address * @nl_client: The index of the netlink client * @map_flags: IWPM mapping flags */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; unsigned long flags; int ret = -EINVAL; map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); if (!map_info) return -ENOMEM; memcpy(&map_info->local_sockaddr, local_sockaddr, sizeof(struct sockaddr_storage)); memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( &map_info->local_sockaddr, &map_info->mapped_sockaddr); if (hash_bucket_head) { hlist_add_head(&map_info->hlist_node, hash_bucket_head); ret = 0; } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); if (!hash_bucket_head) kfree(map_info); return ret; } /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address * info from the hash table * @local_sockaddr: Local ip/tcp address * @mapped_local_addr: Mapped local ip/tcp address * * Returns err code if mapping info is not found in the hash table, * otherwise returns 0 */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_mapping_info *map_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( local_sockaddr, mapped_local_addr); if (!hash_bucket_head) goto remove_mapinfo_exit; hlist_for_each_entry_safe(map_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, mapped_local_addr)) { hlist_del_init(&map_info->hlist_node); kfree(map_info); ret = 0; break; } } } remove_mapinfo_exit: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } static void free_hash_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_mapping_info *map_info; unsigned long flags; int i; /* remove all the mapinfo data from the list */ spin_lock_irqsave(&iwpm_mapinfo_lock, flags); for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(map_info, tmp_hlist_node, &iwpm_hash_bucket[i], hlist_node) { hlist_del_init(&map_info->hlist_node); kfree(map_info); } } /* free the hash list */ kfree(iwpm_hash_bucket); iwpm_hash_bucket = NULL; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); } static void free_reminfo_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_remote_info *rem_info; unsigned long flags; int i; /* remove all the remote info from the list */ spin_lock_irqsave(&iwpm_reminfo_lock, flags); for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(rem_info, tmp_hlist_node, &iwpm_reminfo_bucket[i], hlist_node) { hlist_del_init(&rem_info->hlist_node); kfree(rem_info); } } /* free the hash list */ kfree(iwpm_reminfo_bucket); iwpm_reminfo_bucket = NULL; spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) { struct hlist_head *hash_bucket_head; unsigned long flags; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( &rem_info->mapped_loc_sockaddr, &rem_info->mapped_rem_sockaddr); if (hash_bucket_head) hlist_add_head(&rem_info->hlist_node, hash_bucket_head); } spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } /** * iwpm_get_remote_info - Get the remote connecting peer address info * * @mapped_loc_addr: Mapped local address of the listening peer * @mapped_rem_addr: Mapped remote address of the connecting peer * @remote_addr: To store the remote address of the connecting peer * @nl_client: The index of the netlink client * * The remote address info is retrieved and provided to the client in * the remote_addr. After that it is removed from the hash table */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, u8 nl_client) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_remote_info *rem_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( mapped_loc_addr, mapped_rem_addr); if (!hash_bucket_head) goto get_remote_info_exit; hlist_for_each_entry_safe(rem_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr, mapped_loc_addr) && !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr, mapped_rem_addr)) { memcpy(remote_addr, &rem_info->remote_sockaddr, sizeof(struct sockaddr_storage)); iwpm_print_sockaddr(remote_addr, "get_remote_info: Remote sockaddr:"); hlist_del_init(&rem_info->hlist_node); kfree(rem_info); ret = 0; break; } } } get_remote_info_exit: spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); return ret; } struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); if (!nlmsg_request) return NULL; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); kref_init(&nlmsg_request->kref); kref_get(&nlmsg_request->kref); nlmsg_request->nlmsg_seq = nlmsg_seq; nlmsg_request->nl_client = nl_client; nlmsg_request->request_done = 0; nlmsg_request->err_code = 0; sema_init(&nlmsg_request->sem, 1); down(&nlmsg_request->sem); return nlmsg_request; } void iwpm_free_nlmsg_request(struct kref *kref) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_del_init(&nlmsg_request->inprocess_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); if (!nlmsg_request->request_done) pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", __func__, nlmsg_request->nlmsg_seq); kfree(nlmsg_request); } struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) { struct iwpm_nlmsg_request *nlmsg_request; struct iwpm_nlmsg_request *found_request = NULL; unsigned long flags; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, inprocess_list) { if (nlmsg_request->nlmsg_seq == echo_seq) { found_request = nlmsg_request; kref_get(&nlmsg_request->kref); break; } } spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); return found_request; } int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) { int ret; ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT); if (ret) { ret = -EINVAL; pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); } else { ret = nlmsg_request->err_code; } kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); return ret; } int iwpm_get_nlmsg_seq(void) { return atomic_inc_return(&iwpm_admin.nlmsg_seq); } /* valid client */ u32 iwpm_get_registration(u8 nl_client) { return iwpm_admin.reg_list[nl_client]; } /* valid client */ void iwpm_set_registration(u8 nl_client, u32 reg) { iwpm_admin.reg_list[nl_client] = reg; } /* valid client */ u32 iwpm_check_registration(u8 nl_client, u32 reg) { return (iwpm_get_registration(nl_client) & reg); } int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr) { if (a_sockaddr->ss_family != b_sockaddr->ss_family) return 1; if (a_sockaddr->ss_family == AF_INET) { struct sockaddr_in *a4_sockaddr = (struct sockaddr_in *)a_sockaddr; struct sockaddr_in *b4_sockaddr = (struct sockaddr_in *)b_sockaddr; if (!memcmp(&a4_sockaddr->sin_addr, &b4_sockaddr->sin_addr, sizeof(struct in_addr)) && a4_sockaddr->sin_port == b4_sockaddr->sin_port) return 0; } else if (a_sockaddr->ss_family == AF_INET6) { struct sockaddr_in6 *a6_sockaddr = (struct sockaddr_in6 *)a_sockaddr; struct sockaddr_in6 *b6_sockaddr = (struct sockaddr_in6 *)b_sockaddr; if (!memcmp(&a6_sockaddr->sin6_addr, &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) return 0; } else { pr_err("%s: Invalid sockaddr family\n", __func__); } return 1; } struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, int nl_client) { struct sk_buff *skb = NULL; skb = dev_alloc_skb(IWPM_MSG_SIZE); if (!skb) goto create_nlmsg_exit; if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, NLM_F_REQUEST))) { pr_warn("%s: Unable to put the nlmsg header\n", __func__); dev_kfree_skb(skb); skb = NULL; } create_nlmsg_exit: return skb; } int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, const struct nla_policy *nlmsg_policy, struct nlattr *nltb[], const char *msg_type) { int nlh_len = 0; int ret; const char *err_str = ""; ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Invalid attribute"; goto parse_nlmsg_error; } ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Unable to parse the nlmsg"; goto parse_nlmsg_error; } ret = iwpm_validate_nlmsg_attr(nltb, policy_max); if (ret) { err_str = "Invalid NULL attribute"; goto parse_nlmsg_error; } return 0; parse_nlmsg_error: pr_warn("%s: %s (msg type %s ret = %d)\n", __func__, err_str, msg_type, ret); return ret; } void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) { struct sockaddr_in6 *sockaddr_v6; struct sockaddr_in *sockaddr_v4; switch (sockaddr->ss_family) { case AF_INET: sockaddr_v4 = (struct sockaddr_in *)sockaddr; pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", msg, &sockaddr_v4->sin_addr, ntohs(sockaddr_v4->sin_port), ntohs(sockaddr_v4->sin_port)); break; case AF_INET6: sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", msg, &sockaddr_v6->sin6_addr, ntohs(sockaddr_v6->sin6_port), ntohs(sockaddr_v6->sin6_port)); break; default: break; } } static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) { u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); return hash; } static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) { u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); return hash; } static int get_hash_bucket(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr, u32 *hash) { u32 a_hash, b_hash; if (a_sockaddr->ss_family == AF_INET) { a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr); b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr); } else if (a_sockaddr->ss_family == AF_INET6) { a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr); b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr); } else { pr_err("%s: Invalid sockaddr family\n", __func__); return -EINVAL; } if (a_hash == b_hash) /* if port mapper isn't available */ *hash = a_hash; else *hash = jhash_2words(a_hash, b_hash, 0); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash); if (ret) return NULL; return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK]; } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *mapped_loc_sockaddr, struct sockaddr_storage *mapped_rem_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash); if (ret) return NULL; return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK]; } static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto mapinfo_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); msg_seq = 0; err_str = "Unable to put attribute of mapinfo number nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); if (ret) goto mapinfo_num_error; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto mapinfo_num_error; } pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num); return 0; mapinfo_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; } static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) { struct nlmsghdr *nlh = NULL; int ret = 0; if (!skb) return ret; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; } int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) { struct iwpm_mapping_info *map_info; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; int skb_num = 0, mapping_num = 0; int i = 0, nlmsg_bytes = 0; unsigned long flags; const char *err_str = ""; int ret; skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } skb_num++; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); ret = -EINVAL; for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], hlist_node) { if (map_info->nl_client != nl_client) continue; nlh = NULL; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { ret = -ENOMEM; err_str = "Unable to put the nlmsg header"; goto send_mapping_info_unlock; } err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->local_sockaddr, IWPM_NLA_MAPINFO_LOCAL_ADDR); if (ret) goto send_mapping_info_unlock; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->mapped_sockaddr, IWPM_NLA_MAPINFO_MAPPED_ADDR); if (ret) goto send_mapping_info_unlock; if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { ret = ibnl_put_attr(skb, nlh, sizeof(u32), &map_info->map_flags, IWPM_NLA_MAPINFO_FLAGS); if (ret) goto send_mapping_info_unlock; } nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, "send_mapping_info: Local sockaddr:"); iwpm_print_sockaddr(&map_info->mapped_sockaddr, "send_mapping_info: Mapped local sockaddr:"); mapping_num++; nlmsg_bytes += nlh->nlmsg_len; /* check if all mappings can fit in one skb */ if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { /* and leave room for NLMSG_DONE */ nlmsg_bytes = 0; skb_num++; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); /* send the skb */ ret = send_nlmsg_done(skb, nl_client, iwpm_pid); skb = NULL; if (ret) { err_str = "Unable to send map info"; goto send_mapping_info_exit; } if (skb_num == IWPM_MAPINFO_SKB_COUNT) { ret = -ENOMEM; err_str = "Insufficient skbs for map info"; goto send_mapping_info_exit; } skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } spin_lock_irqsave(&iwpm_mapinfo_lock, flags); } } } send_mapping_info_unlock: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); send_mapping_info_exit: if (ret) { pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); dev_kfree_skb(skb); return ret; } send_nlmsg_done(skb, nl_client, iwpm_pid); return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); } int iwpm_mapinfo_available(void) { unsigned long flags; int full_bucket = 0, i = 0; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { if (!hlist_empty(&iwpm_hash_bucket[i])) { full_bucket = 1; break; } } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; const char *err_str; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto hello_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); err_str = "Unable to put attribute of abi_version into nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, IWPM_NLA_HELLO_ABI_VERSION); if (ret) goto hello_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto hello_num_error; } pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); return 0; hello_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; }
9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 // SPDX-License-Identifier: GPL-2.0-only #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/mempool.h> #include <linux/slab.h> #define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools) #define SG_MEMPOOL_SIZE 2 struct sg_pool { size_t size; char *name; struct kmem_cache *slab; mempool_t *pool; }; #define SP(x) { .size = x, "sgpool-" __stringify(x) } #if (SG_CHUNK_SIZE < 32) #error SG_CHUNK_SIZE is too small (must be 32 or greater) #endif static struct sg_pool sg_pools[] = { SP(8), SP(16), #if (SG_CHUNK_SIZE > 32) SP(32), #if (SG_CHUNK_SIZE > 64) SP(64), #if (SG_CHUNK_SIZE > 128) SP(128), #if (SG_CHUNK_SIZE > 256) #error SG_CHUNK_SIZE is too large (256 MAX) #endif #endif #endif #endif SP(SG_CHUNK_SIZE) }; #undef SP static inline unsigned int sg_pool_index(unsigned short nents) { unsigned int index; BUG_ON(nents > SG_CHUNK_SIZE); if (nents <= 8) index = 0; else index = get_count_order(nents) - 3; return index; } static void sg_pool_free(struct scatterlist *sgl, unsigned int nents) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); mempool_free(sgl, sgp->pool); } static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); return mempool_alloc(sgp->pool, gfp_mask); } /** * sg_free_table_chained - Free a previously mapped sg table * @table: The sg table header to use * @nents_first_chunk: size of the first_chunk SGL passed to * sg_alloc_table_chained * * Description: * Free an sg table previously allocated and setup with * sg_alloc_table_chained(). * * @nents_first_chunk has to be same with that same parameter passed * to sg_alloc_table_chained(). * **/ void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk) { if (table->orig_nents <= nents_first_chunk) return; if (nents_first_chunk == 1) nents_first_chunk = 0; __sg_free_table(table, SG_CHUNK_SIZE, nents_first_chunk, sg_pool_free, table->orig_nents); } EXPORT_SYMBOL_GPL(sg_free_table_chained); /** * sg_alloc_table_chained - Allocate and chain SGLs in an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @first_chunk: first SGL * @nents_first_chunk: number of the SGL of @first_chunk * * Description: * Allocate and chain SGLs in an sg table. If @nents@ is larger than * @nents_first_chunk a chained sg table will be setup. @first_chunk is * ignored if nents_first_chunk <= 1 because user expects the SGL points * non-chain SGL. * **/ int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk) { int ret; BUG_ON(!nents); if (first_chunk && nents_first_chunk) { if (nents <= nents_first_chunk) { table->nents = table->orig_nents = nents; sg_init_table(table->sgl, nents); return 0; } } /* User supposes that the 1st SGL includes real entry */ if (nents_first_chunk <= 1) { first_chunk = NULL; nents_first_chunk = 0; } ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE, first_chunk, nents_first_chunk, GFP_ATOMIC, sg_pool_alloc); if (unlikely(ret)) sg_free_table_chained(table, nents_first_chunk); return ret; } EXPORT_SYMBOL_GPL(sg_alloc_table_chained); static __init int sg_pool_init(void) { int i; for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); sgp->slab = kmem_cache_create(sgp->name, size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!sgp->slab) { printk(KERN_ERR "SG_POOL: can't init sg slab %s\n", sgp->name); goto cleanup_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, sgp->slab); if (!sgp->pool) { printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n", sgp->name); goto cleanup_sdb; } } return 0; cleanup_sdb: for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; mempool_destroy(sgp->pool); kmem_cache_destroy(sgp->slab); } return -ENOMEM; } subsys_initcall(sg_pool_init);
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0-only /* * In-kernel rpcbind client supporting versions 2, 3, and 4 of the rpcbind * protocol * * Based on RFC 1833: "Binding Protocols for ONC RPC Version 2" and * RFC 3530: "Network File System (NFS) version 4 Protocol" * * Original: Gilles Quillard, Bull Open Source, 2005 <gilles.quillard@bull.net> * Updated: Chuck Lever, Oracle Corporation, 2007 <chuck.lever@oracle.com> * * Descended from net/sunrpc/pmap_clnt.c, * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mutex.h> #include <linux/slab.h> #include <net/ipv6.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/xprtsock.h> #include <trace/events/sunrpc.h> #include "netns.h" #define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock" #define RPCBIND_SOCK_ABSTRACT_NAME "\0/run/rpcbind.sock" #define RPCBIND_PROGRAM (100000u) #define RPCBIND_PORT (111u) #define RPCBVERS_2 (2u) #define RPCBVERS_3 (3u) #define RPCBVERS_4 (4u) enum { RPCBPROC_NULL, RPCBPROC_SET, RPCBPROC_UNSET, RPCBPROC_GETPORT, RPCBPROC_GETADDR = 3, /* alias for GETPORT */ RPCBPROC_DUMP, RPCBPROC_CALLIT, RPCBPROC_BCAST = 5, /* alias for CALLIT */ RPCBPROC_GETTIME, RPCBPROC_UADDR2TADDR, RPCBPROC_TADDR2UADDR, RPCBPROC_GETVERSADDR, RPCBPROC_INDIRECT, RPCBPROC_GETADDRLIST, RPCBPROC_GETSTAT, }; /* * r_owner * * The "owner" is allowed to unset a service in the rpcbind database. * * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a * UID which it maps to a local user name via a password lookup. * In all other cases it is ignored. * * For SET/UNSET requests, user space provides a value, even for * network requests, and GETADDR uses an empty string. We follow * those precedents here. */ #define RPCB_OWNER_STRING "0" #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) /* * XDR data type sizes */ #define RPCB_program_sz (1) #define RPCB_version_sz (1) #define RPCB_protocol_sz (1) #define RPCB_port_sz (1) #define RPCB_boolean_sz (1) #define RPCB_netid_sz (1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN)) #define RPCB_addr_sz (1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN)) #define RPCB_ownerstring_sz (1 + XDR_QUADLEN(RPCB_MAXOWNERLEN)) /* * XDR argument and result sizes */ #define RPCB_mappingargs_sz (RPCB_program_sz + RPCB_version_sz + \ RPCB_protocol_sz + RPCB_port_sz) #define RPCB_getaddrargs_sz (RPCB_program_sz + RPCB_version_sz + \ RPCB_netid_sz + RPCB_addr_sz + \ RPCB_ownerstring_sz) #define RPCB_getportres_sz RPCB_port_sz #define RPCB_setres_sz RPCB_boolean_sz /* * Note that RFC 1833 does not put any size restrictions on the * address string returned by the remote rpcbind database. */ #define RPCB_getaddrres_sz RPCB_addr_sz static void rpcb_getport_done(struct rpc_task *, void *); static void rpcb_map_release(void *data); static const struct rpc_program rpcb_program; struct rpcbind_args { struct rpc_xprt * r_xprt; u32 r_prog; u32 r_vers; u32 r_prot; unsigned short r_port; const char * r_netid; const char * r_addr; const char * r_owner; int r_status; }; static const struct rpc_procinfo rpcb_procedures2[]; static const struct rpc_procinfo rpcb_procedures3[]; static const struct rpc_procinfo rpcb_procedures4[]; struct rpcb_info { u32 rpc_vers; const struct rpc_procinfo *rpc_proc; }; static const struct rpcb_info rpcb_next_version[]; static const struct rpcb_info rpcb_next_version6[]; static const struct rpc_call_ops rpcb_getport_ops = { .rpc_call_done = rpcb_getport_done, .rpc_release = rpcb_map_release, }; static void rpcb_wake_rpcbind_waiters(struct rpc_xprt *xprt, int status) { xprt_clear_binding(xprt); rpc_wake_up_status(&xprt->binding, status); } static void rpcb_map_release(void *data) { struct rpcbind_args *map = data; rpcb_wake_rpcbind_waiters(map->r_xprt, map->r_status); xprt_put(map->r_xprt); kfree(map->r_addr); kfree(map); } static int rpcb_get_local(struct net *net) { int cnt; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); spin_lock(&sn->rpcb_clnt_lock); if (sn->rpcb_users) sn->rpcb_users++; cnt = sn->rpcb_users; spin_unlock(&sn->rpcb_clnt_lock); return cnt; } void rpcb_put_local(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt = sn->rpcb_local_clnt; struct rpc_clnt *clnt4 = sn->rpcb_local_clnt4; int shutdown = 0; spin_lock(&sn->rpcb_clnt_lock); if (sn->rpcb_users) { if (--sn->rpcb_users == 0) { sn->rpcb_local_clnt = NULL; sn->rpcb_local_clnt4 = NULL; } shutdown = !sn->rpcb_users; } spin_unlock(&sn->rpcb_clnt_lock); if (shutdown) { /* * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister */ if (clnt4) rpc_shutdown_client(clnt4); if (clnt) rpc_shutdown_client(clnt); } } static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt, struct rpc_clnt *clnt4, bool is_af_local) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); /* Protected by rpcb_create_local_mutex */ sn->rpcb_local_clnt = clnt; sn->rpcb_local_clnt4 = clnt4; sn->rpcb_is_af_local = is_af_local ? 1 : 0; smp_wmb(); sn->rpcb_users = 1; } /* Evaluate to actual length of the `sockaddr_un' structure. */ # define SUN_LEN(ptr) (offsetof(struct sockaddr_un, sun_path) \ + 1 + strlen((ptr)->sun_path + 1)) /* * Returns zero on success, otherwise a negative errno value * is returned. */ static int rpcb_create_af_local(struct net *net, const struct sockaddr_un *addr) { struct rpc_create_args args = { .net = net, .protocol = XPRT_TRANSPORT_LOCAL, .address = (struct sockaddr *)addr, .addrsize = SUN_LEN(addr), .servername = "localhost", .program = &rpcb_program, .version = RPCBVERS_2, .authflavor = RPC_AUTH_NULL, .cred = current_cred(), /* * We turn off the idle timeout to prevent the kernel * from automatically disconnecting the socket. * Otherwise, we'd have to cache the mount namespace * of the caller and somehow pass that to the socket * reconnect code. */ .flags = RPC_CLNT_CREATE_NO_IDLE_TIMEOUT, }; struct rpc_clnt *clnt, *clnt4; int result = 0; /* * Because we requested an RPC PING at transport creation time, * this works only if the user space portmapper is rpcbind, and * it's listening on AF_LOCAL on the named socket. */ clnt = rpc_create(&args); if (IS_ERR(clnt)) { result = PTR_ERR(clnt); goto out; } clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); if (IS_ERR(clnt4)) clnt4 = NULL; rpcb_set_local(net, clnt, clnt4, true); out: return result; } static int rpcb_create_local_abstract(struct net *net) { static const struct sockaddr_un rpcb_localaddr_abstract = { .sun_family = AF_LOCAL, .sun_path = RPCBIND_SOCK_ABSTRACT_NAME, }; return rpcb_create_af_local(net, &rpcb_localaddr_abstract); } static int rpcb_create_local_unix(struct net *net) { static const struct sockaddr_un rpcb_localaddr_unix = { .sun_family = AF_LOCAL, .sun_path = RPCBIND_SOCK_PATHNAME, }; return rpcb_create_af_local(net, &rpcb_localaddr_unix); } /* * Returns zero on success, otherwise a negative errno value * is returned. */ static int rpcb_create_local_net(struct net *net) { static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_LOOPBACK), .sin_port = htons(RPCBIND_PORT), }; struct rpc_create_args args = { .net = net, .protocol = XPRT_TRANSPORT_TCP, .address = (struct sockaddr *)&rpcb_inaddr_loopback, .addrsize = sizeof(rpcb_inaddr_loopback), .servername = "localhost", .program = &rpcb_program, .version = RPCBVERS_2, .authflavor = RPC_AUTH_UNIX, .cred = current_cred(), .flags = RPC_CLNT_CREATE_NOPING, }; struct rpc_clnt *clnt, *clnt4; int result = 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { result = PTR_ERR(clnt); goto out; } /* * This results in an RPC ping. On systems running portmapper, * the v4 ping will fail. Proceed anyway, but disallow rpcb * v4 upcalls. */ clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); if (IS_ERR(clnt4)) clnt4 = NULL; rpcb_set_local(net, clnt, clnt4, false); out: return result; } /* * Returns zero on success, otherwise a negative errno value * is returned. */ int rpcb_create_local(struct net *net) { static DEFINE_MUTEX(rpcb_create_local_mutex); int result = 0; if (rpcb_get_local(net)) return result; mutex_lock(&rpcb_create_local_mutex); if (rpcb_get_local(net)) goto out; if (rpcb_create_local_abstract(net) != 0 && rpcb_create_local_unix(net) != 0) result = rpcb_create_local_net(net); out: mutex_unlock(&rpcb_create_local_mutex); return result; } static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, const char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version, const struct cred *cred, const struct rpc_timeout *timeo) { struct rpc_create_args args = { .net = net, .protocol = proto, .address = srvaddr, .addrsize = salen, .timeout = timeo, .servername = hostname, .nodename = nodename, .program = &rpcb_program, .version = version, .authflavor = RPC_AUTH_UNIX, .cred = cred, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_NONPRIVPORT), }; switch (srvaddr->sa_family) { case AF_INET: ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); break; case AF_INET6: ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT); break; default: return ERR_PTR(-EAFNOSUPPORT); } return rpc_create(&args); } static int rpcb_register_call(struct sunrpc_net *sn, struct rpc_clnt *clnt, struct rpc_message *msg, bool is_set) { int flags = RPC_TASK_NOCONNECT; int error, result = 0; if (is_set || !sn->rpcb_is_af_local) flags = RPC_TASK_SOFTCONN; msg->rpc_resp = &result; error = rpc_call_sync(clnt, msg, flags); if (error < 0) return error; if (!result) return -EACCES; return 0; } /** * rpcb_register - set or unset a port registration with the local rpcbind svc * @net: target network namespace * @prog: RPC program number to bind * @vers: RPC version number to bind * @prot: transport protocol to register * @port: port value to register * * Returns zero if the registration request was dispatched successfully * and the rpcbind daemon returned success. Otherwise, returns an errno * value that reflects the nature of the error (request could not be * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services * invoke this function once for each [program, version, transport] * tuple they wish to advertise. * * Callers may also unregister RPC services that are no longer * available by setting the passed-in port to zero. This removes * all registered transports for [program, version] from the local * rpcbind database. * * This function uses rpcbind protocol version 2 to contact the * local rpcbind daemon. * * Registration works over both AF_INET and AF_INET6, and services * registered via this function are advertised as available for any * address. If the local rpcbind daemon is listening on AF_INET6, * services registered via this function will be advertised on * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6 * addresses). */ int rpcb_register(struct net *net, u32 prog, u32 vers, int prot, unsigned short port) { struct rpcbind_args map = { .r_prog = prog, .r_vers = vers, .r_prot = prot, .r_port = port, }; struct rpc_message msg = { .rpc_argp = &map, }; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); bool is_set = false; trace_pmap_register(prog, vers, prot, port); msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET]; if (port != 0) { msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; is_set = true; } return rpcb_register_call(sn, sn->rpcb_local_clnt, &msg, is_set); } /* * Fill in AF_INET family-specific arguments to register */ static int rpcb_register_inet4(struct sunrpc_net *sn, const struct sockaddr *sap, struct rpc_message *msg) { const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; unsigned short port = ntohs(sin->sin_port); bool is_set = false; int result; map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL); msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; if (port != 0) { msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; is_set = true; } result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set); kfree(map->r_addr); return result; } /* * Fill in AF_INET6 family-specific arguments to register */ static int rpcb_register_inet6(struct sunrpc_net *sn, const struct sockaddr *sap, struct rpc_message *msg) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; unsigned short port = ntohs(sin6->sin6_port); bool is_set = false; int result; map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL); msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; if (port != 0) { msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; is_set = true; } result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set); kfree(map->r_addr); return result; } static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn, struct rpc_message *msg) { struct rpcbind_args *map = msg->rpc_argp; trace_rpcb_unregister(map->r_prog, map->r_vers, map->r_netid); map->r_addr = ""; msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; return rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, false); } /** * rpcb_v4_register - set or unset a port registration with the local rpcbind * @net: target network namespace * @program: RPC program number of service to (un)register * @version: RPC version number of service to (un)register * @address: address family, IP address, and port to (un)register * @netid: netid of transport protocol to (un)register * * Returns zero if the registration request was dispatched successfully * and the rpcbind daemon returned success. Otherwise, returns an errno * value that reflects the nature of the error (request could not be * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services * invoke this function once for each [program, version, address, * netid] tuple they wish to advertise. * * Callers may also unregister RPC services that are registered at a * specific address by setting the port number in @address to zero. * They may unregister all registered protocol families at once for * a service by passing a NULL @address argument. If @netid is "" * then all netids for [program, version, address] are unregistered. * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support * version 4 of the rpcbind protocol in order for these functions * to register a service successfully. * * Supported netids include "udp" and "tcp" for UDP and TCP over * IPv4, and "udp6" and "tcp6" for UDP and TCP over IPv6, * respectively. * * The contents of @address determine the address family and the * port to be registered. The usual practice is to pass INADDR_ANY * as the raw address, but specifying a non-zero address is also * supported by this API if the caller wishes to advertise an RPC * service on a specific network interface. * * Note that passing in INADDR_ANY does not create the same service * registration as IN6ADDR_ANY. The former advertises an RPC * service on any IPv4 address, but not on IPv6. The latter * advertises the service on all IPv4 and IPv6 addresses. */ int rpcb_v4_register(struct net *net, const u32 program, const u32 version, const struct sockaddr *address, const char *netid) { struct rpcbind_args map = { .r_prog = program, .r_vers = version, .r_netid = netid, .r_owner = RPCB_OWNER_STRING, }; struct rpc_message msg = { .rpc_argp = &map, }; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->rpcb_local_clnt4 == NULL) return -EPROTONOSUPPORT; if (address == NULL) return rpcb_unregister_all_protofamilies(sn, &msg); trace_rpcb_register(map.r_prog, map.r_vers, map.r_addr, map.r_netid); switch (address->sa_family) { case AF_INET: return rpcb_register_inet4(sn, address, &msg); case AF_INET6: return rpcb_register_inet6(sn, address, &msg); } return -EAFNOSUPPORT; } static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, const struct rpc_procinfo *proc) { struct rpc_message msg = { .rpc_proc = proc, .rpc_argp = map, .rpc_resp = map, }; struct rpc_task_setup task_setup_data = { .rpc_client = rpcb_clnt, .rpc_message = &msg, .callback_ops = &rpcb_getport_ops, .callback_data = map, .flags = RPC_TASK_ASYNC | RPC_TASK_SOFTCONN, }; return rpc_run_task(&task_setup_data); } /* * In the case where rpc clients have been cloned, we want to make * sure that we use the program number/version etc of the actual * owner of the xprt. To do so, we walk back up the tree of parents * to find whoever created the transport and/or whoever has the * autobind flag set. */ static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) { struct rpc_clnt *parent = clnt->cl_parent; struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); while (parent != clnt) { if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps) break; if (clnt->cl_autobind) break; clnt = parent; parent = parent->cl_parent; } return clnt; } /** * rpcb_getport_async - obtain the port for a given RPC service on a given host * @task: task that is waiting for portmapper request * * This one can be called for an ongoing RPC request, and can be used in * an async (rpciod) context. */ void rpcb_getport_async(struct rpc_task *task) { struct rpc_clnt *clnt; const struct rpc_procinfo *proc; u32 bind_version; struct rpc_xprt *xprt; struct rpc_clnt *rpcb_clnt; struct rpcbind_args *map; struct rpc_task *child; struct sockaddr_storage addr; struct sockaddr *sap = (struct sockaddr *)&addr; size_t salen; int status; rcu_read_lock(); clnt = rpcb_find_transport_owner(task->tk_client); rcu_read_unlock(); xprt = xprt_get(task->tk_xprt); /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ rpc_sleep_on_timeout(&xprt->binding, task, NULL, jiffies + xprt->bind_timeout); if (xprt_test_and_set_binding(xprt)) { xprt_put(xprt); return; } /* Someone else may have bound if we slept */ if (xprt_bound(xprt)) { status = 0; goto bailout_nofree; } /* Parent transport's destination address */ salen = rpc_peeraddr(clnt, sap, sizeof(addr)); /* Don't ever use rpcbind v2 for AF_INET6 requests */ switch (sap->sa_family) { case AF_INET: proc = rpcb_next_version[xprt->bind_index].rpc_proc; bind_version = rpcb_next_version[xprt->bind_index].rpc_vers; break; case AF_INET6: proc = rpcb_next_version6[xprt->bind_index].rpc_proc; bind_version = rpcb_next_version6[xprt->bind_index].rpc_vers; break; default: status = -EAFNOSUPPORT; goto bailout_nofree; } if (proc == NULL) { xprt->bind_index = 0; status = -EPFNOSUPPORT; goto bailout_nofree; } trace_rpcb_getport(clnt, task, bind_version); rpcb_clnt = rpcb_create(xprt->xprt_net, clnt->cl_nodename, xprt->servername, sap, salen, xprt->prot, bind_version, clnt->cl_cred, task->tk_client->cl_timeout); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); goto bailout_nofree; } map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask()); if (!map) { status = -ENOMEM; goto bailout_release_client; } map->r_prog = clnt->cl_prog; map->r_vers = clnt->cl_vers; map->r_prot = xprt->prot; map->r_port = 0; map->r_xprt = xprt; map->r_status = -EIO; switch (bind_version) { case RPCBVERS_4: case RPCBVERS_3: map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID]; map->r_addr = rpc_sockaddr2uaddr(sap, rpc_task_gfp_mask()); if (!map->r_addr) { status = -ENOMEM; goto bailout_free_args; } map->r_owner = ""; break; case RPCBVERS_2: map->r_addr = NULL; break; default: BUG(); } child = rpcb_call_async(rpcb_clnt, map, proc); rpc_release_client(rpcb_clnt); if (IS_ERR(child)) { /* rpcb_map_release() has freed the arguments */ return; } xprt->stat.bind_count++; rpc_put_task(child); return; bailout_free_args: kfree(map); bailout_release_client: rpc_release_client(rpcb_clnt); bailout_nofree: rpcb_wake_rpcbind_waiters(xprt, status); task->tk_status = status; xprt_put(xprt); } EXPORT_SYMBOL_GPL(rpcb_getport_async); /* * Rpcbind child task calls this callback via tk_exit. */ static void rpcb_getport_done(struct rpc_task *child, void *data) { struct rpcbind_args *map = data; struct rpc_xprt *xprt = map->r_xprt; map->r_status = child->tk_status; /* Garbage reply: retry with a lesser rpcbind version */ if (map->r_status == -EIO) map->r_status = -EPROTONOSUPPORT; /* rpcbind server doesn't support this rpcbind protocol version */ if (map->r_status == -EPROTONOSUPPORT) xprt->bind_index++; if (map->r_status < 0) { /* rpcbind server not available on remote host? */ map->r_port = 0; } else if (map->r_port == 0) { /* Requested RPC service wasn't registered on remote host */ map->r_status = -EACCES; } else { /* Succeeded */ map->r_status = 0; } trace_rpcb_setport(child, map->r_status, map->r_port); if (map->r_port) { xprt->ops->set_port(xprt, map->r_port); xprt_set_bound(xprt); } } /* * XDR functions for rpcbind */ static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr, const void *data) { const struct rpcbind_args *rpcb = data; __be32 *p; p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2); *p++ = cpu_to_be32(rpcb->r_prog); *p++ = cpu_to_be32(rpcb->r_vers); *p++ = cpu_to_be32(rpcb->r_prot); *p = cpu_to_be32(rpcb->r_port); } static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct rpcbind_args *rpcb = data; unsigned long port; __be32 *p; rpcb->r_port = 0; p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; port = be32_to_cpup(p); if (unlikely(port > USHRT_MAX)) return -EIO; rpcb->r_port = port; return 0; } static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { unsigned int *boolp = data; __be32 *p; p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; *boolp = 0; if (*p != xdr_zero) *boolp = 1; return 0; } static void encode_rpcb_string(struct xdr_stream *xdr, const char *string, const u32 maxstrlen) { __be32 *p; u32 len; len = strlen(string); WARN_ON_ONCE(len > maxstrlen); if (len > maxstrlen) /* truncate and hope for the best */ len = maxstrlen; p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, string, len); } static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, const void *data) { const struct rpcbind_args *rpcb = data; __be32 *p; p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2); *p++ = cpu_to_be32(rpcb->r_prog); *p = cpu_to_be32(rpcb->r_vers); encode_rpcb_string(xdr, rpcb->r_netid, RPCBIND_MAXNETIDLEN); encode_rpcb_string(xdr, rpcb->r_addr, RPCBIND_MAXUADDRLEN); encode_rpcb_string(xdr, rpcb->r_owner, RPCB_MAXOWNERLEN); } static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct rpcbind_args *rpcb = data; struct sockaddr_storage address; struct sockaddr *sap = (struct sockaddr *)&address; __be32 *p; u32 len; rpcb->r_port = 0; p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) goto out_fail; len = be32_to_cpup(p); /* * If the returned universal address is a null string, * the requested RPC service was not registered. */ if (len == 0) return 0; if (unlikely(len > RPCBIND_MAXUADDRLEN)) goto out_fail; p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) goto out_fail; if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len, sap, sizeof(address)) == 0) goto out_fail; rpcb->r_port = rpc_get_port(sap); return 0; out_fail: return -EIO; } /* * Not all rpcbind procedures described in RFC 1833 are implemented * since the Linux kernel RPC code requires only these. */ static const struct rpc_procinfo rpcb_procedures2[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, .p_encode = rpcb_enc_mapping, .p_decode = rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, .p_timer = 0, .p_name = "SET", }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, .p_encode = rpcb_enc_mapping, .p_decode = rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, .p_timer = 0, .p_name = "UNSET", }, [RPCBPROC_GETPORT] = { .p_proc = RPCBPROC_GETPORT, .p_encode = rpcb_enc_mapping, .p_decode = rpcb_dec_getport, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_getportres_sz, .p_statidx = RPCBPROC_GETPORT, .p_timer = 0, .p_name = "GETPORT", }, }; static const struct rpc_procinfo rpcb_procedures3[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, .p_encode = rpcb_enc_getaddr, .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, .p_timer = 0, .p_name = "SET", }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, .p_encode = rpcb_enc_getaddr, .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, .p_timer = 0, .p_name = "UNSET", }, [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, .p_encode = rpcb_enc_getaddr, .p_decode = rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, .p_statidx = RPCBPROC_GETADDR, .p_timer = 0, .p_name = "GETADDR", }, }; static const struct rpc_procinfo rpcb_procedures4[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, .p_encode = rpcb_enc_getaddr, .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, .p_timer = 0, .p_name = "SET", }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, .p_encode = rpcb_enc_getaddr, .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, .p_timer = 0, .p_name = "UNSET", }, [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, .p_encode = rpcb_enc_getaddr, .p_decode = rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, .p_statidx = RPCBPROC_GETADDR, .p_timer = 0, .p_name = "GETADDR", }, }; static const struct rpcb_info rpcb_next_version[] = { { .rpc_vers = RPCBVERS_2, .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], }, { .rpc_proc = NULL, }, }; static const struct rpcb_info rpcb_next_version6[] = { { .rpc_vers = RPCBVERS_4, .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], }, { .rpc_vers = RPCBVERS_3, .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], }, { .rpc_proc = NULL, }, }; static unsigned int rpcb_version2_counts[ARRAY_SIZE(rpcb_procedures2)]; static const struct rpc_version rpcb_version2 = { .number = RPCBVERS_2, .nrprocs = ARRAY_SIZE(rpcb_procedures2), .procs = rpcb_procedures2, .counts = rpcb_version2_counts, }; static unsigned int rpcb_version3_counts[ARRAY_SIZE(rpcb_procedures3)]; static const struct rpc_version rpcb_version3 = { .number = RPCBVERS_3, .nrprocs = ARRAY_SIZE(rpcb_procedures3), .procs = rpcb_procedures3, .counts = rpcb_version3_counts, }; static unsigned int rpcb_version4_counts[ARRAY_SIZE(rpcb_procedures4)]; static const struct rpc_version rpcb_version4 = { .number = RPCBVERS_4, .nrprocs = ARRAY_SIZE(rpcb_procedures4), .procs = rpcb_procedures4, .counts = rpcb_version4_counts, }; static const struct rpc_version *rpcb_version[] = { NULL, NULL, &rpcb_version2, &rpcb_version3, &rpcb_version4 }; static struct rpc_stat rpcb_stats; static const struct rpc_program rpcb_program = { .name = "rpcbind", .number = RPCBIND_PROGRAM, .nrvers = ARRAY_SIZE(rpcb_version), .version = rpcb_version, .stats = &rpcb_stats, };
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <net/ipv6.h> #if IS_ENABLED(CONFIG_IPV6) #if !IS_BUILTIN(CONFIG_IPV6) static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { int ret; ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? 0 : -EINVAL; synchronize_net(); return ret; } EXPORT_SYMBOL(inet6_unregister_icmp_sender); void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmpv6_ndo_send); #endif #endif
189 44 224 514 187 652 160 160 300 212 160 160 160 160 382 419 418 419 300 212 353 353 69 418 419 114 160 160 160 160 160 67 159 226 187 160 160 158 160 159 160 160 160 160 160 67 158 159 157 87 159 32 32 66 66 66 66 66 33 34 34 34 34 33 147 147 147 147 147 336 334 337 76 335 510 429 497 147 497 494 511 510 510 510 510 511 159 159 511 159 497 34 34 497 147 491 184 167 48 13 209 64 487 486 380 485 482 483 285 33 192 257 5 5 1 12 12 26 26 12 26 26 26 391 209 184 115 115 115 209 209 208 245 209 4 145 128 145 1 145 16 157 14 357 356 141 211 3 1 119 1 137 4 9 1 9 7 4 134 110 8 135 183 121 206 16 1445 1449 1446 1823 1825 281 1826 1830 1082 1802 1803 813 894 774 856 1008 776 889 106 106 131 1597 1594 11 1590 1589 307 52 1450 1447 1427 229 229 107 176 216 216 58 20 196 295 260 224 210 249 195 6 2 248 6 241 26 219 22 222 22 227 228 2 100 210 84 84 84 84 82 83 80 82 81 82 81 82 38 36 26 26 25 23 26 26 1 26 26 25 26 26 26 26 26 26 25 25 26 26 272 272 267 272 270 271 271 272 271 272 272 272 109 212 271 1 272 271 74 270 2 2 2 2 2 2 2 2 2 2 2 2 2 26 20 21 26 26 26 26 26 26 26 26 14 17 29 9 18 17 14 11 10 3 10 16 30 32 31 30 79 1 79 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. * * Jens Laas <jens.laas@data.slu.se> Swedish University of * Agricultural Sciences. * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * * This work is based on the LPC-trie which is originally described in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * * Code from fib_hash has been reused which includes the following header: * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> * Stephen Hemminger <shemminger@osdl.org> * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */ #include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/notifier.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) #define KEY_MAX ((t_key)~0) typedef unsigned int t_key; #define IS_TRIE(n) ((n)->pos >= KEYLENGTH) #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) struct key_vector { t_key key; unsigned char pos; /* 2log(KEYLENGTH) bits needed */ unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; union { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; struct tnode { struct rcu_head rcu; t_key empty_children; /* KEYLENGTH bits needed */ t_key full_children; /* KEYLENGTH bits needed */ struct key_vector __rcu *parent; struct key_vector kv[1]; #define tn_bits kv[0].bits }; #define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) #define LEAF_SIZE TNODE_SIZE(1) #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; unsigned int backtrack; unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; unsigned int resize_node_skipped; }; #endif struct trie_stat { unsigned int totdepth; unsigned int maxdepth; unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; struct trie { struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); static unsigned int tnode_free_size; /* * synchronize_rcu after call_rcu for outstanding dirty memory; it should be * especially useful before resizing the root node with PREEMPT_NONE configs; * the value was obtained experimentally, aiming to avoid visible slowdown. */ unsigned int sysctl_fib_sync_mem = 512 * 1024; unsigned int sysctl_fib_sync_mem_min = 64 * 1024; unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { return container_of(kv, struct tnode, kv[0]); } /* caller must hold RTNL */ #define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) #define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ #define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) #define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) rcu_assign_pointer(tn_info(n)->parent, tp); } #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) static inline unsigned long get_index(t_key key, struct key_vector *kv) { unsigned long index = key ^ kv->key; if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) return 0; return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is * necessary. Every node in the trie has a key associated with it, but not * all of the bits in that key are significant. * * Consider a node 'n' and its parent 'tp'. * * If n is a leaf, every bit in its key is significant. Its presence is * necessitated by path compression, since during a tree traversal (when * searching for a leaf - unless we are doing an insertion) we will completely * ignore all skipped bits we encounter. Thus we need to verify, at the end of * a potentially successful search, that we have indeed been walking the * correct key path. * * Note that we can never "miss" the correct key in the tree if present by * following the wrong path. Path compression ensures that segments of the key * that are the same for all keys with a given prefix are skipped, but the * skipped part *is* identical for each node in the subtrie below the skipped * bit! trie_insert() in this implementation takes care of that. * * if n is an internal node - a 'tnode' here, the various parts of its key * have many different meanings. * * Example: * _________________________________________________________________ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | * ----------------------------------------------------------------- * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 * * _________________________________________________________________ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | * ----------------------------------------------------------------- * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 * * tp->pos = 22 * tp->bits = 3 * n->pos = 13 * n->bits = 4 * * First, let's just ignore the bits that come before the parent tp, that is * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this * point we do not use them for anything. * * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the * index into the parent's child array. That is, they will be used to find * 'n' among tp's children. * * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits * for the node n. * * All the bits we have seen so far are significant to the node n. The rest * of the bits are really not needed or indeed known in n->key. * * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into * n's child array, and will of course be different for each child. * * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown * at this point. */ static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; static inline void alias_free_mem_rcu(struct fib_alias *fa) { kfree_rcu(fa, rcu); } #define TNODE_VMALLOC_MAX \ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); else kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) static struct tnode *tnode_alloc(int bits) { size_t size; /* verify bits is within bounds */ if (bits > TNODE_VMALLOC_MAX) return NULL; /* determine size and verify it is non-zero and didn't overflow */ size = TNODE_SIZE(1ul << bits); if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } static inline void empty_child_inc(struct key_vector *n) { tn_info(n)->empty_children++; if (!tn_info(n)->empty_children) tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { if (!tn_info(n)->empty_children) tn_info(n)->full_children--; tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { struct key_vector *l; struct tnode *kv; kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; l->slen = fa->fa_slen; /* link leaf to fib alias */ INIT_HLIST_HEAD(&l->leaf); hlist_add_head(&fa->fa_list, &l->leaf); return l; } static struct key_vector *tnode_new(t_key key, int pos, int bits) { unsigned int shift = pos + bits; struct key_vector *tn; struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); tnode = tnode_alloc(bits); if (!tnode) return NULL; pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), sizeof(struct key_vector *) << bits); if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; tn->slen = pos; return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ static void put_child(struct key_vector *tn, unsigned long i, struct key_vector *n) { struct key_vector *chi = get_child(tn, i); int isfull, wasfull; BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ if (!n && chi) empty_child_inc(tn); if (n && !chi) empty_child_dec(tn); /* update fullChildren */ wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn_info(tn)->full_children--; else if (!wasfull && isfull) tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; rcu_assign_pointer(tn->tnode[i], n); } static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); if (!inode) continue; /* Either update the children of a tnode that * already belongs to us or update the child * to point to ourselves. */ if (node_parent(inode) == tn) update_children(inode); else node_set_parent(inode, tn); } } static inline void put_child_root(struct key_vector *tp, t_key key, struct key_vector *n) { if (IS_TRIE(tp)) rcu_assign_pointer(tp->tnode[0], n); else put_child(tp, get_index(key, tp), n); } static inline void tnode_free_init(struct key_vector *tn) { tn_info(tn)->rcu.next = NULL; } static inline void tnode_free_append(struct key_vector *tn, struct key_vector *n) { tn_info(n)->rcu.next = tn_info(tn)->rcu.next; tn_info(tn)->rcu.next = &tn_info(n)->rcu; } static void tnode_free(struct key_vector *tn) { struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; synchronize_net(); } } static struct key_vector *replace(struct trie *t, struct key_vector *oldtnode, struct key_vector *tn) { struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); /* all pointers should be clean so we are done */ tnode_free(oldtnode); /* resize children now that oldtnode is freed */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) tn = resize(t, inode); } return tp; } static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; t_key m; pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { struct key_vector *inode = get_child(oldtnode, --i); struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ if (!inode) continue; /* A leaf or an internal node with skipped bits */ if (!tnode_full(oldtnode, inode)) { put_child(tn, get_index(inode->key, tn), inode); continue; } /* drop the node in the old tnode free list */ tnode_free_append(oldtnode, inode); /* An internal node with two children */ if (inode->bits == 1) { put_child(tn, 2 * i + 1, get_child(inode, 1)); put_child(tn, 2 * i, get_child(inode, 0)); continue; } /* We will replace this node 'inode' with two new * ones, 'node0' and 'node1', each with half of the * original children. The two new nodes will have * a position one bit further down the key and this * means that the "significant" part of their keys * (see the discussion near the top of this file) * will differ by one bit, which will be "0" in * node0's key and "1" in node1's key. Since we are * moving the key position by one step, the bit that * we are moving away from - the bit at position * (tn->pos) - is the one that will differ between * node0 and node1. So... we synthesize that bit in the * two new keys. */ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); if (!node1) goto nomem; node0 = tnode_new(inode->key, inode->pos, inode->bits - 1); tnode_free_append(tn, node1); if (!node0) goto nomem; tnode_free_append(tn, node0); /* populate child pointers in new nodes */ for (k = child_length(inode), j = k / 2; j;) { put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ NODE_INIT_PARENT(node1, tn); NODE_INIT_PARENT(node0, tn); /* link parent to nodes */ put_child(tn, 2 * i + 1, node1); put_child(tn, 2 * i, node0); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode); i;) { struct key_vector *node1 = get_child(oldtnode, --i); struct key_vector *node0 = get_child(oldtnode, --i); struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { put_child(tn, i / 2, node1 ? : node0); continue; } /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); if (!inode) goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ put_child(inode, 1, node1); put_child(inode, 0, node0); NODE_INIT_PARENT(inode, tn); /* link parent to node */ put_child(tn, i / 2, inode); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *collapse(struct trie *t, struct key_vector *oldtnode) { struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ for (n = NULL, i = child_length(oldtnode); !n && i;) n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); return tp; } static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; unsigned char slen_max; /* only vector 0 can have a suffix length greater than or equal to * tn->pos + tn->bits, the second highest node will have a suffix * length at most of tn->pos + tn->bits - 1 */ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen); /* search though the list of children looking for nodes that might * have a suffix greater than the one we currently have. This is * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; /* update stride and slen based on new value */ stride <<= (n->slen - slen); slen = n->slen; i &= ~(stride - 1); /* stop searching if we have hit the maximum possible value */ if (slen >= slen_max) break; } tn->slen = slen; return slen; } /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. * * A clearer way to write this would be: * * to_be_doubled = tn->full_children; * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) * * ...and so on, tho it would mess up the while () loop. * * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= * inflate_threshold * child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * * child_length(tn) * */ static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; used -= tn_info(tn)->empty_children; used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold; used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } static inline bool should_collapse(struct key_vector *tn) { unsigned long used = child_length(tn); used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ return used < 2; } #define MAX_WORK 10 static struct key_vector *resize(struct trie *t, struct key_vector *tn) { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif struct key_vector *tp = node_parent(tn); unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* track the tnode via the pointer from the parent instead of * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { tp = inflate(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* update parent in case inflate failed */ tp = node_parent(tn); /* Return if at least one inflate is run */ if (max_work != MAX_WORK) return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { tp = halve(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* Only one child remains */ if (should_collapse(tn)) return collapse(t, tn); /* update parent in case halve failed */ return node_parent(tn); } static void node_pull_suffix(struct key_vector *tn, unsigned char slen) { unsigned char node_slen = tn->slen; while ((node_slen > tn->pos) && (node_slen > slen)) { slen = update_suffix(tn); if (node_slen == slen) break; tn = node_parent(tn); node_slen = tn->slen; } } static void node_push_suffix(struct key_vector *tn, unsigned char slen) { while (tn->slen < slen) { tn->slen = slen; tn = node_parent(tn); } } /* rcu_read_lock needs to be hold by caller from readside */ static struct key_vector *fib_find_node(struct trie *t, struct key_vector **tp, u32 key) { struct key_vector *pn, *n = t->kv; unsigned long index = 0; do { pn = n; n = get_child_rcu(n, index); if (!n) break; index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) { n = NULL; break; } /* keep searching until we find a perfect match leaf or NULL */ } while (IS_TNODE(n)); *tp = pn; return n; } /* Return the first fib alias matching DSCP with * priority less than or equal to PRIO. * If 'find_first' is set, return the first matching * fib alias, regardless of DSCP and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, dscp_t dscp, u32 prio, u32 tb_id, bool find_first) { struct fib_alias *fa; if (!fah) return NULL; hlist_for_each_entry(fa, fah, fa_list) { /* Avoid Sparse warning when using dscp_t in inequalities */ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp); u8 __dscp = inet_dscp_to_dsfield(dscp); if (fa->fa_slen < slen) continue; if (fa->fa_slen != slen) break; if (fa->tb_id > tb_id) continue; if (fa->tb_id != tb_id) break; if (find_first) return fa; if (__fa_dscp > __dscp) continue; if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp) return fa; } return NULL; } static struct fib_alias * fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) { u8 slen = KEYLENGTH - fri->dst_len; struct key_vector *l, *tp; struct fib_table *tb; struct fib_alias *fa; struct trie *t; tb = fib_get_table(net, fri->tb_id); if (!tb) return NULL; t = (struct trie *)tb->tb_data; l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); if (!l) return NULL; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi && fa->fa_type == fri->type) return fa; } return NULL; } void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; rcu_read_lock(); fa_match = fib_find_matching_alias(net, fri); if (!fa_match) goto out; /* These are paired with the WRITE_ONCE() happening in this function. * The reason is that we are only protected by RCU at this point. */ if (READ_ONCE(fa_match->offload) == fri->offload && READ_ONCE(fa_match->trap) == fri->trap && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) tn = resize(t, tn); } static int fib_insert_node(struct trie *t, struct key_vector *tp, struct fib_alias *new, t_key key) { struct key_vector *n, *l; l = leaf_new(key, new); if (!l) goto noleaf; /* retrieve child from parent node */ n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * * Add a new tnode here * first tnode need some special handling * leaves us in position for handling as case 3 */ if (n) { struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ tp = tn; } /* Case 3: n is NULL, and will just insert a new leaf */ node_push_suffix(tp, new->fa_slen); NODE_INIT_PARENT(l, tp); put_child_root(tp, key, l); trie_rebalance(t, tp); return 0; notnode: node_free(l); noleaf: return -ENOMEM; } static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) { if (!l) return fib_insert_node(t, tp, new, key); if (fa) { hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { struct fib_alias *last; hlist_for_each_entry(last, &l->leaf, fa_list) { if (new->fa_slen < last->fa_slen) break; if ((new->fa_slen == last->fa_slen) && (new->tb_id > last->tb_id)) break; fa = last; } if (fa) hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); else hlist_add_head_rcu(&new->fa_list, &l->leaf); } /* if we added to the tail node then we need to update slen */ if (l->slen < new->fa_slen) { l->slen = new->fa_slen; node_push_suffix(tp, new->fa_slen); } return 0; } static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; u16 nlflags = NLM_F_EXCL; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; int err; key = ntohl(cfg->fc_dst); pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } dscp = cfg->fc_dscp; l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority, tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,dscp,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and * insert to the tail of the section matching the suffix length * of the new alias. */ if (fa && fa->fa_dscp == dscp && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_first, *fa_match; err = -EEXIST; if (cfg->fc_nlflags & NLM_F_EXCL) goto out; nlflags &= ~NLM_F_EXCL; /* We have 2 goals: * 1. Find exact match for type, scope, fib_info to avoid * duplicate routes * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it */ fa_match = NULL; fa_first = fa; hlist_for_each_entry_from(fa, fa_list) { if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && fa->fa_info == fi) { fa_match = fa; break; } } if (cfg->fc_nlflags & NLM_F_REPLACE) { struct fib_info *fi_drop; u8 state; nlflags |= NLM_F_REPLACE; fa = fa_first; if (fa_match) { if (fa == fa_match) err = 0; goto out; } err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; fi_drop = fa->fa_info; new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) { hlist_replace_rcu(&new_fa->fa_list, &fa->fa_list); goto out_free_new_fa; } } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop * information. */ if (fa_match) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; else fa = fa_first; } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_dscp = dscp; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; /* The alias was already inserted, so the node must exist. */ l = l ? l : fib_find_node(t, &tp, key); if (WARN_ON_ONCE(!l)) { err = -ENOENT; goto out_free_new_fa; } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; } if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_remove_new_fa: fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err: return err; } static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; return (key ^ prefix) & (prefix | -prefix); } bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp) { if (nhc->nhc_flags & RTNH_F_DEAD) return false; if (ip_ignore_linkdown(nhc->nhc_dev) && nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) return false; return true; } /* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); struct key_vector *n, *pn; struct fib_alias *fa; unsigned long index; t_key cindex; pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); if (!n) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); #endif /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ if (IS_LEAF(n)) goto found; /* only record pn and cindex if we are going to be chopping * bits later. Otherwise we are just wasting cycles. */ if (n->slen > n->pos) { pn = n; cindex = index; } n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of * the lsb and higher in the prefix. */ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos)) goto backtrace; /* exit out and process leaf */ if (unlikely(IS_LEAF(n))) break; /* Don't bother recording parent info. Since we are in * prefix match mode we will have to come back to wherever * we started this traversal anyway */ while ((n = rcu_dereference(*cptr)) == NULL) { backtrace: #ifdef CONFIG_IP_FIB_TRIE_STATS if (!n) this_cpu_inc(stats->null_node_hit); #endif /* If we are at cindex 0 there are no more bits for * us to strip at this level so we must ascend back * up one level to see if there are any more bits to * be stripped there. */ while (!cindex) { t_key pkey = pn->key; /* If we don't have a parent then there is * nothing for us to do as we do not have any * further nodes to parse. */ if (IS_TRIE(pn)) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } /* strip the least significant bit from the cindex */ cindex &= cindex - 1; /* grab pointer for next child node */ cptr = &pn->tnode[cindex]; } } found: /* this line carries forward the xor from earlier in the function */ index = key ^ n->key; /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; struct fib_nh_common *nhc; int nhsel, err; if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { if (index >= (1ul << fa->fa_slen)) continue; } if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; if (unlikely(fi->nh)) { if (nexthop_is_blackhole(fi->nh)) { err = fib_props[RTN_BLACKHOLE].error; goto out_reject; } nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp, &nhsel); if (nhc) goto set_result; goto miss; } for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { nhc = fib_info_nhc(fi, nhsel); if (!fib_lookup_good_nhc(nhc, fib_flags, flp)) continue; set_result: if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; res->dscp = fa->fa_dscp; res->fi = fi; res->table = tb; res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } } miss: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_miss); #endif goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old) { /* record the location of the previous list_info entry */ struct hlist_node **pprev = old->fa_list.pprev; struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); /* remove the fib_alias from the list */ hlist_del_rcu(&old->fa_list); /* if we emptied the list this leaf will be freed and we can sort * out parent suffix lengths as a part of trie_rebalance */ if (hlist_empty(&l->leaf)) { if (tp->slen == l->slen) node_pull_suffix(tp, tp->pos); put_child_root(tp, l->key, NULL); node_free(l); trie_rebalance(t, tp); return; } /* only access fa if it is pointing at the last valid hlist_node */ if (*pprev) return; /* update the trie with the latest suffix length */ l->slen = fa->fa_slen; node_pull_suffix(tp, fa->fa_slen); } static void fib_notify_alias_delete(struct net *net, u32 key, struct hlist_head *fah, struct fib_alias *fa_to_delete, struct netlink_ext_ack *extack) { struct fib_alias *fa_next, *fa_to_notify; u32 tb_id = fa_to_delete->tb_id; u8 slen = fa_to_delete->fa_slen; enum fib_event_type fib_event; /* Do not notify if we do not care about the route. */ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) return; /* Determine if the route should be replaced by the next route in the * list. */ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, struct fib_alias, fa_list); if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { fib_event = FIB_EVENT_ENTRY_REPLACE; fa_to_notify = fa_next; } else { fib_event = FIB_EVENT_ENTRY_DEL; fa_to_notify = fa_to_delete; } call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, fa_to_notify, extack); } /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; struct key_vector *l, *tp; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; key = ntohl(cfg->fc_dst); l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; dscp = cfg->fc_dscp; fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen, inet_dscp_to_dsfield(dscp), t); fa_to_delete = NULL; hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || fa->fa_info->fib_scope == cfg->fc_scope) && (!cfg->fc_prefsrc || fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(net, cfg, fi, extack) == 0 && fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; } } if (!fa_to_delete) return -ESRCH; fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; fib_remove_alias(t, tp, l, fa_to_delete); if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); alias_free_mem_rcu(fa_to_delete); return 0; } /* Scan for the next leaf starting at the provided key value */ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { struct key_vector *pn, *n = *tn; unsigned long cindex; /* this loop is meant to try and find the key in the trie */ do { /* record parent and next child index */ pn = n; cindex = (key > pn->key) ? get_index(key, pn) : 0; if (cindex >> pn->bits) break; /* descend into the next child */ n = get_child_rcu(pn, cindex++); if (!n) break; /* guarantee forward progress on the keys */ if (IS_LEAF(n) && (n->key >= key)) goto found; } while (IS_TNODE(n)); /* this loop will search for the next leaf with a greater key */ while (!IS_TRIE(pn)) { /* if we exhausted the parent node we will need to climb */ if (cindex >= (1ul << pn->bits)) { t_key pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; continue; } /* grab the next available node */ n = get_child_rcu(pn, cindex++); if (!n) continue; /* no need to compare keys since we bumped the index */ if (IS_LEAF(n)) goto found; /* Rescan start scanning in new node */ pn = n; cindex = 0; } *tn = pn; return NULL; /* Root of trie */ found: /* if we are at the limit for keys just return NULL for the tnode */ *tn = pn; return n; } static void fib_trie_free(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order and free everything */ for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; n = pn; pn = node_parent(pn); /* drop emptied tnode */ put_child_root(pn, n->key, NULL); node_free(n); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); } put_child_root(pn, n->key, NULL); node_free(n); } #ifdef CONFIG_IP_FIB_TRIE_STATS free_percpu(t->stats); #endif kfree(tb); } struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { struct trie *ot = (struct trie *)oldtb->tb_data; struct key_vector *l, *tp = ot->kv; struct fib_table *local_tb; struct fib_alias *fa; struct trie *lt; t_key key = 0; if (oldtb->tb_data == oldtb->__data) return oldtb; local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); if (!local_tb) return NULL; lt = (struct trie *)local_tb->tb_data; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) continue; /* clone fa for new local table */ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; memcpy(new_fa, fa, sizeof(*fa)); /* insert clone into table */ if (!local_l) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, NULL, l->key)) { kmem_cache_free(fn_alias_kmem, new_fa); goto out; } } /* stop loop if key wrapped back to 0 */ key = l->key + 1; if (key < l->key) break; } return local_tb; out: fib_trie_free(local_tb); return NULL; } /* Caller must hold RTNL */ void fib_table_flush_external(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { /* if alias was cloned to local then we just * need to remove the local copy from main */ if (tb->tb_id != fa->tb_id) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); continue; } /* record local slen */ slen = fa->fa_slen; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } } /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; int found = 0; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || tb->tb_id != fa->tb_id || (!(fi->fib_flags & RTNH_F_DEAD) && !fib_props[fa->fa_type].error)) { slen = fa->fa_slen; continue; } /* Do not flush error routes if network namespace is * not being dismantled */ if (!flush_all && fib_props[fa->fa_type].error) { slen = fa->fa_slen; continue; } fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); if (fi->pfsrc_removed) rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); found++; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } pr_debug("trie_flush found=%d\n", found); return found; } /* derived from fib_trie_free */ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, struct nl_info *info) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct fib_alias *fa; for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; pn = node_parent(pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) continue; rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); } } } void fib_info_notify_update(struct net *net, struct nl_info *info) { unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) __fib_info_notify_update(net, tb, info); } } static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib_alias *fa; int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi) continue; /* local and main table can share the same trie, * so don't notify twice for the same entry. */ if (tb->tb_id != fa->tb_id) continue; if (fa->fa_slen == last_slen) continue; last_slen = fa->fa_slen; err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; } return 0; } static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { err = fib_leaf_notify(l, tb, nb, extack); if (err) return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } return 0; } int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { unsigned int h; int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { err = fib_table_notify(tb, nb, extack); if (err) return err; } } return 0; } static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; if (tb->tb_data == tb->__data) free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } void fib_free_table(struct fib_table *tb) { call_rcu(&tb->rcu, __trie_free_rcu); } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; if (filter->filter_set || !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; i_fa = 0; if (tb->tb_id != fa->tb_id) goto next; if (filter->filter_set) { if (filter->rt_type && fa->fa_type != filter->rt_type) goto next; if ((filter->protocol && fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } if (filter->dump_routes) { if (!s_fa) { struct fib_rt_info fri; fri.fi = fi; fri.tb_id = tb->tb_id; fri.dst = xkey; fri.dst_len = KEYLENGTH - fa->fa_slen; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } i_fa++; } if (filter->dump_exceptions) { err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, &i_fa, s_fa, flags); if (err < 0) goto stop; } next: i++; } cb->args[4] = i; return skb->len; stop: cb->args[4] = i; cb->args[5] = i_fa; return err; } /* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ int count = cb->args[2]; t_key key = cb->args[3]; /* First time here, count and key are both always 0. Count > 0 * and key == 0 means the dump has wrapped around and we are done. */ if (count && !key) return 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; return err; } ++count; key = l->key + 1; memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); /* stop loop if key wrapped back to 0 */ if (key < l->key) break; } cb->args[3] = key; cb->args[2] = count; return 0; } void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; size_t sz = sizeof(*tb); if (!alias) sz += sizeof(struct trie); tb = kzalloc(sz, GFP_KERNEL); if (!tb) return NULL; tb->tb_id = id; tb->tb_num_default = 0; tb->tb_data = (alias ? alias->__data : tb->__data); if (alias) return tb; t = (struct trie *) tb->tb_data; t->kv[0].pos = KEYLENGTH; t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { kfree(tb); tb = NULL; } #endif return tb; } #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct key_vector *tnode; unsigned int index; unsigned int depth; }; static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; struct key_vector *pn = iter->tnode; t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); while (!IS_TRIE(pn)) { while (cindex < child_length(pn)) { struct key_vector *n = get_child_rcu(pn, cindex++); if (!n) continue; if (IS_LEAF(n)) { iter->tnode = pn; iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } return n; } /* Current node exhausted, pop back up */ pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; --iter->depth; } /* record root node so further searches know we are done */ iter->tnode = pn; iter->index = 0; return NULL; } static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { struct key_vector *n, *pn; if (!t) return NULL; pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; if (IS_TNODE(n)) { iter->tnode = n; iter->index = 0; iter->depth = 1; } else { iter->tnode = pn; iter->index = 0; iter->depth = 0; } return n; } static void trie_collect_stats(struct trie *t, struct trie_stat *s) { struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); } /* * This outputs /proc/net/fib_triestats */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; else avdepth = 0; seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) max--; pointers = 0; for (i = 1; i < max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } #ifdef CONFIG_IP_FIB_TRIE_STATS static void trie_show_usage(struct seq_file *seq, const struct trie_use_stats __percpu *stats) { struct trie_use_stats s = { 0 }; int cpu; /* loop through all of the CPUs and gather up the stats */ for_each_possible_cpu(cpu) { const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); s.gets += pcpu->gets; s.backtrack += pcpu->backtrack; s.semantic_match_passed += pcpu->semantic_match_passed; s.semantic_match_miss += pcpu->semantic_match_miss; s.null_node_hit += pcpu->null_node_hit; s.resize_node_skipped += pcpu->resize_node_skipped; } seq_printf(seq, "\nCounters:\n---------\n"); seq_printf(seq, "gets = %u\n", s.gets); seq_printf(seq, "backtracks = %u\n", s.backtrack); seq_printf(seq, "semantic match passed = %u\n", s.semantic_match_passed); seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); seq_printf(seq, "null node hit= %u\n", s.null_node_hit); seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) { if (tb->tb_id == RT_TABLE_LOCAL) seq_puts(seq, "Local:\n"); else if (tb->tb_id == RT_TABLE_MAIN) seq_puts(seq, "Main:\n"); else seq_printf(seq, "Id %d:\n", tb->tb_id); } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; unsigned int h; seq_printf(seq, "Basic info: size of leaf:" " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = (struct trie *) tb->tb_data; struct trie_stat stat; if (!t) continue; fib_table_print(seq, tb); trie_collect_stats(t, &stat); trie_show_stats(seq, &stat); #ifdef CONFIG_IP_FIB_TRIE_STATS trie_show_usage(seq, t->stats); #endif } cond_resched_rcu(); } rcu_read_unlock(); return 0; } static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); loff_t idx = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); n; n = fib_trie_get_next(iter)) if (pos == idx++) { iter->tb = tb; return n; } } } return NULL; } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return fib_trie_get_idx(seq, *pos); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; struct key_vector *n; ++*pos; /* next node in same table */ n = fib_trie_get_next(iter); if (n) return n; /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } } return NULL; found: iter->tb = tb; return n; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void seq_indent(struct seq_file *seq, int n) { while (n-- > 0) seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) { switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; case RT_SCOPE_LINK: return "link"; case RT_SCOPE_HOST: return "host"; case RT_SCOPE_NOWHERE: return "nowhere"; default: snprintf(buf, len, "scope=%d", s); return buf; } } static const char *const rtn_type_names[__RTN_MAX] = { [RTN_UNSPEC] = "UNSPEC", [RTN_UNICAST] = "UNICAST", [RTN_LOCAL] = "LOCAL", [RTN_BROADCAST] = "BROADCAST", [RTN_ANYCAST] = "ANYCAST", [RTN_MULTICAST] = "MULTICAST", [RTN_BLACKHOLE] = "BLACKHOLE", [RTN_UNREACHABLE] = "UNREACHABLE", [RTN_PROHIBIT] = "PROHIBIT", [RTN_THROW] = "THROW", [RTN_NAT] = "NAT", [RTN_XRESOLVE] = "XRESOLVE", }; static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; snprintf(buf, len, "type %u", t); return buf; } /* Pretty print the trie */ static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; struct key_vector *n = v; if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { __be32 prf = htonl(n->key); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, tn_info(n)->full_children, tn_info(n)->empty_children); } else { __be32 val = htonl(n->key); struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { char buf1[32], buf2[32]; seq_indent(seq, iter->depth + 1); seq_printf(seq, " /%zu %s %s", KEYLENGTH - fa->fa_slen, rtn_scope(buf1, sizeof(buf1), fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_dscp) seq_printf(seq, " tos=%d", inet_dscp_to_dsfield(fa->fa_dscp)); seq_putc(seq, '\n'); } } return 0; } static const struct seq_operations fib_trie_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, .show = fib_trie_seq_show, }; struct fib_route_iter { struct seq_net_private p; struct fib_table *main_tb; struct key_vector *tnode; loff_t pos; t_key key; }; static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) { struct key_vector *l, **tp = &iter->tnode; t_key key; /* use cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { key = iter->key; } else { iter->pos = 1; key = 0; } pos -= iter->pos; while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; l = NULL; /* handle unlikely case of a key wrap */ if (!key) break; } if (l) iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ return l; } static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; struct trie *t; rcu_read_lock(); tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; iter->main_tb = tb; t = (struct trie *)tb->tb_data; iter->tnode = t->kv; if (*pos != 0) return fib_route_get_idx(iter, *pos); iter->pos = 0; iter->key = KEY_MAX; return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; t_key key = iter->key + 1; ++*pos; /* only allow key of 0 for start of sequence */ if ((v == SEQ_START_TOKEN) || key) l = leaf_walk_rcu(&iter->tnode, key); if (l) { iter->key = l->key; iter->pos++; } else { iter->pos = 0; } return l; } static void fib_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; if (fi) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); if (nhc->nhc_gw.ipv4) flags |= RTF_GATEWAY; } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; return flags; } /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct fib_route_iter *iter = seq->private; struct fib_table *tb = iter->main_tb; struct fib_alias *fa; struct key_vector *l = v; __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" "\tWindow\tIRTT"); return 0; } prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); if ((fa->fa_type == RTN_BROADCAST) || (fa->fa_type == RTN_MULTICAST)) continue; if (fa->tb_id != tb->tb_id) continue; seq_setwidth(seq, 127); if (fi) { struct fib_nh_common *nhc = fib_info_nhc(fi, 0); __be32 gw = 0; if (nhc->nhc_gw_family == AF_INET) gw = nhc->nhc_gw.ipv4; seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } seq_pad(seq, '\n'); } return 0; } static const struct seq_operations fib_route_seq_ops = { .start = fib_route_seq_start, .next = fib_route_seq_next, .stop = fib_route_seq_stop, .show = fib_route_seq_show, }; int __net_init fib_proc_init(struct net *net) { if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, sizeof(struct fib_trie_iter))) goto out1; if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, fib_triestat_seq_show, NULL)) goto out2; if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops, sizeof(struct fib_route_iter))) goto out3; return 0; out3: remove_proc_entry("fib_triestat", net->proc_net); out2: remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { remove_proc_entry("fib_trie", net->proc_net); remove_proc_entry("fib_triestat", net->proc_net); remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */
7 6 1 5 5 4 2 2 2 2 4 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; enum ethtool_mac_stats_src src; }; #define PAUSE_REQINFO(__req_base) \ container_of(__req_base, struct pause_req_info, base) struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_PAUSE_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int pause_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct pause_req_info *req_info = PAUSE_REQINFO(req_base); if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { NL_SET_ERR_MSG_MOD(extack, "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); return -EINVAL; } src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); } req_info->src = src; return 0; } static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return 0; } /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_STATS_SRC] = { .type = NLA_REJECT }, }; static int ethnl_set_pause_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_pauseparam(dev, &params); ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_pauseparam(dev, &params); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .parse_request = pause_parse_request, .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, .set_validate = ethnl_set_pause_validate, .set = ethnl_set_pause, .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, };
36 16 20 80 78 1 9 2 7 1 224 192 16 2 50 228 224 225 224 6 6 192 192 11 7 7 5 5 5 1 1 3 43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_fifo.c The simplest FIFO queue. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> /* 1 band FIFO pseudo-"scheduler" */ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); } static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); } static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int prev_backlog; if (unlikely(READ_ONCE(sch->limit) == 0)) return qdisc_drop(skb, sch, to_free); if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; /* queue full, remove one skb to fulfill the limit */ __qdisc_queue_drop_head(sch, &sch->q, to_free); qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); qdisc_tree_reduce_backlog(sch, 0, prev_backlog - sch->qstats.backlog); return NET_XMIT_CN; } static void fifo_offload_init(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct tc_fifo_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_FIFO_REPLACE; qopt.handle = sch->handle; qopt.parent = sch->parent; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); } static void fifo_offload_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct tc_fifo_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_FIFO_DESTROY; qopt.handle = sch->handle; qopt.parent = sch->parent; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); } static int fifo_offload_dump(struct Qdisc *sch) { struct tc_fifo_qopt_offload qopt; qopt.command = TC_FIFO_STATS; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.stats.bstats = &sch->bstats; qopt.stats.qstats = &sch->qstats; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_FIFO, &qopt); } static int __fifo_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { bool bypass; bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { u32 limit = qdisc_dev(sch)->tx_queue_len; if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); WRITE_ONCE(sch->limit, limit); } else { struct tc_fifo_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; WRITE_ONCE(sch->limit, ctl->limit); } if (is_bfifo) bypass = sch->limit >= psched_mtu(qdisc_dev(sch)); else bypass = sch->limit >= 1; if (bypass) sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int fifo_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { int err; err = __fifo_init(sch, opt, extack); if (err) return err; fifo_offload_init(sch); return 0; } static int fifo_hd_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { return __fifo_init(sch, opt, extack); } static void fifo_destroy(struct Qdisc *sch) { fifo_offload_destroy(sch); } static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_fifo_qopt opt = { .limit = READ_ONCE(sch->limit) }; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { int err; err = fifo_offload_dump(sch); if (err) return err; return __fifo_dump(sch, skb); } static int fifo_hd_dump(struct Qdisc *sch, struct sk_buff *skb) { return __fifo_dump(sch, skb); } struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", .priv_size = 0, .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, .owner = THIS_MODULE, }; EXPORT_SYMBOL(pfifo_qdisc_ops); struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .id = "bfifo", .priv_size = 0, .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, .owner = THIS_MODULE, }; EXPORT_SYMBOL(bfifo_qdisc_ops); struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .id = "pfifo_head_drop", .priv_size = 0, .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_hd_init, .reset = qdisc_reset_queue, .change = fifo_hd_init, .dump = fifo_hd_dump, .owner = THIS_MODULE, }; /* Pass size change message down to embedded FIFO */ int fifo_set_limit(struct Qdisc *q, unsigned int limit) { struct nlattr *nla; int ret = -ENOMEM; /* Hack to avoid sending change message to non-FIFO */ if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; if (!q->ops->change) return 0; nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; ret = q->ops->change(q, nla, NULL); kfree(nla); } return ret; } EXPORT_SYMBOL(fifo_set_limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack) { struct Qdisc *q; int err = -ENOMEM; q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1), extack); if (q) { err = fifo_set_limit(q, limit); if (err < 0) { qdisc_put(q); q = NULL; } } return q ? : ERR_PTR(err); } EXPORT_SYMBOL(fifo_create_dflt); MODULE_DESCRIPTION("Single queue packet and byte based First In First Out(P/BFIFO) scheduler");
9 16 15 5 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 /* SPDX-License-Identifier: GPL-2.0 */ /* * Declarations of NET/ROM type objects. * * Jonathan Naylor G4KLX 9/4/95 */ #ifndef _NETROM_H #define _NETROM_H #include <linux/netrom.h> #include <linux/list.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/refcount.h> #include <linux/seq_file.h> #include <net/ax25.h> #define NR_NETWORK_LEN 15 #define NR_TRANSPORT_LEN 5 #define NR_PROTO_IP 0x0C #define NR_PROTOEXT 0x00 #define NR_CONNREQ 0x01 #define NR_CONNACK 0x02 #define NR_DISCREQ 0x03 #define NR_DISCACK 0x04 #define NR_INFO 0x05 #define NR_INFOACK 0x06 #define NR_RESET 0x07 #define NR_CHOKE_FLAG 0x80 #define NR_NAK_FLAG 0x40 #define NR_MORE_FLAG 0x20 /* Define Link State constants. */ enum { NR_STATE_0, NR_STATE_1, NR_STATE_2, NR_STATE_3 }; #define NR_COND_ACK_PENDING 0x01 #define NR_COND_REJECT 0x02 #define NR_COND_PEER_RX_BUSY 0x04 #define NR_COND_OWN_RX_BUSY 0x08 #define NR_DEFAULT_T1 120000 /* Outstanding frames - 120 seconds */ #define NR_DEFAULT_T2 5000 /* Response delay - 5 seconds */ #define NR_DEFAULT_N2 3 /* Number of Retries - 3 */ #define NR_DEFAULT_T4 180000 /* Busy Delay - 180 seconds */ #define NR_DEFAULT_IDLE 0 /* No Activity Timeout - none */ #define NR_DEFAULT_WINDOW 4 /* Default Window Size - 4 */ #define NR_DEFAULT_OBS 6 /* Default Obsolescence Count - 6 */ #define NR_DEFAULT_QUAL 10 /* Default Neighbour Quality - 10 */ #define NR_DEFAULT_TTL 16 /* Default Time To Live - 16 */ #define NR_DEFAULT_ROUTING 1 /* Is routing enabled ? */ #define NR_DEFAULT_FAILS 2 /* Link fails until route fails */ #define NR_DEFAULT_RESET 0 /* Sent / accept reset cmds? */ #define NR_MODULUS 256 #define NR_MAX_WINDOW_SIZE 127 /* Maximum Window Allowable - 127 */ #define NR_MAX_PACKET_SIZE 236 /* Maximum Packet Length - 236 */ struct nr_sock { struct sock sock; ax25_address user_addr, source_addr, dest_addr; struct net_device *device; unsigned char my_index, my_id; unsigned char your_index, your_id; unsigned char state, condition, bpqext, window; unsigned short vs, vr, va, vl; unsigned char n2, n2count; unsigned long t1, t2, t4, idle; unsigned short fraglen; struct timer_list t1timer; struct timer_list t2timer; struct timer_list t4timer; struct timer_list idletimer; struct sk_buff_head ack_queue; struct sk_buff_head reseq_queue; struct sk_buff_head frag_queue; }; #define nr_sk(sk) ((struct nr_sock *)(sk)) struct nr_neigh { struct hlist_node neigh_node; ax25_address callsign; ax25_digi *digipeat; ax25_cb *ax25; struct net_device *dev; unsigned char quality; unsigned char locked; unsigned short count; unsigned int number; unsigned char failed; refcount_t refcount; }; struct nr_route { unsigned char quality; unsigned char obs_count; struct nr_neigh *neighbour; }; struct nr_node { struct hlist_node node_node; ax25_address callsign; char mnemonic[7]; unsigned char which; unsigned char count; struct nr_route routes[3]; refcount_t refcount; spinlock_t node_lock; }; /********************************************************************* * nr_node & nr_neigh lists, refcounting and locking *********************************************************************/ #define nr_node_hold(__nr_node) \ refcount_inc(&((__nr_node)->refcount)) static __inline__ void nr_node_put(struct nr_node *nr_node) { if (refcount_dec_and_test(&nr_node->refcount)) { kfree(nr_node); } } #define nr_neigh_hold(__nr_neigh) \ refcount_inc(&((__nr_neigh)->refcount)) static __inline__ void nr_neigh_put(struct nr_neigh *nr_neigh) { if (refcount_dec_and_test(&nr_neigh->refcount)) { if (nr_neigh->ax25) ax25_cb_put(nr_neigh->ax25); kfree(nr_neigh->digipeat); kfree(nr_neigh); } } /* nr_node_lock and nr_node_unlock also hold/put the node's refcounter. */ static __inline__ void nr_node_lock(struct nr_node *nr_node) { nr_node_hold(nr_node); spin_lock_bh(&nr_node->node_lock); } static __inline__ void nr_node_unlock(struct nr_node *nr_node) { spin_unlock_bh(&nr_node->node_lock); nr_node_put(nr_node); } #define nr_neigh_for_each(__nr_neigh, list) \ hlist_for_each_entry(__nr_neigh, list, neigh_node) #define nr_neigh_for_each_safe(__nr_neigh, node2, list) \ hlist_for_each_entry_safe(__nr_neigh, node2, list, neigh_node) #define nr_node_for_each(__nr_node, list) \ hlist_for_each_entry(__nr_node, list, node_node) #define nr_node_for_each_safe(__nr_node, node2, list) \ hlist_for_each_entry_safe(__nr_node, node2, list, node_node) /*********************************************************************/ /* af_netrom.c */ extern int sysctl_netrom_default_path_quality; extern int sysctl_netrom_obsolescence_count_initialiser; extern int sysctl_netrom_network_ttl_initialiser; extern int sysctl_netrom_transport_timeout; extern int sysctl_netrom_transport_maximum_tries; extern int sysctl_netrom_transport_acknowledge_delay; extern int sysctl_netrom_transport_busy_delay; extern int sysctl_netrom_transport_requested_window_size; extern int sysctl_netrom_transport_no_activity_timeout; extern int sysctl_netrom_routing_control; extern int sysctl_netrom_link_fails_count; extern int sysctl_netrom_reset_circuit; int nr_rx_frame(struct sk_buff *, struct net_device *); void nr_destroy_socket(struct sock *); /* nr_dev.c */ int nr_rx_ip(struct sk_buff *, struct net_device *); void nr_setup(struct net_device *); /* nr_in.c */ int nr_process_rx_frame(struct sock *, struct sk_buff *); /* nr_loopback.c */ void nr_loopback_init(void); void nr_loopback_clear(void); int nr_loopback_queue(struct sk_buff *); /* nr_out.c */ void nr_output(struct sock *, struct sk_buff *); void nr_send_nak_frame(struct sock *); void nr_kick(struct sock *); void nr_transmit_buffer(struct sock *, struct sk_buff *); void nr_establish_data_link(struct sock *); void nr_enquiry_response(struct sock *); void nr_check_iframes_acked(struct sock *, unsigned short); /* nr_route.c */ void nr_rt_device_down(struct net_device *); struct net_device *nr_dev_first(void); struct net_device *nr_dev_get(ax25_address *); int nr_rt_ioctl(unsigned int, void __user *); void nr_link_failed(ax25_cb *, int); int nr_route_frame(struct sk_buff *, ax25_cb *); extern const struct seq_operations nr_node_seqops; extern const struct seq_operations nr_neigh_seqops; void nr_rt_free(void); /* nr_subr.c */ void nr_clear_queues(struct sock *); void nr_frames_acked(struct sock *, unsigned short); void nr_requeue_frames(struct sock *); int nr_validate_nr(struct sock *, unsigned short); int nr_in_rx_window(struct sock *, unsigned short); void nr_write_internal(struct sock *, int); void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags); /* * This routine is called when a Connect Acknowledge with the Choke Flag * set is needed to refuse a connection. */ #define nr_transmit_refusal(skb, mine) \ do { \ __nr_transmit_reply((skb), (mine), NR_CONNACK | NR_CHOKE_FLAG); \ } while (0) /* * This routine is called when we don't have a circuit matching an incoming * NET/ROM packet. This is an G8PZT Xrouter extension. */ #define nr_transmit_reset(skb, mine) \ do { \ __nr_transmit_reply((skb), (mine), NR_RESET); \ } while (0) void nr_disconnect(struct sock *, int); /* nr_timer.c */ void nr_init_timers(struct sock *sk); void nr_start_heartbeat(struct sock *); void nr_start_t1timer(struct sock *); void nr_start_t2timer(struct sock *); void nr_start_t4timer(struct sock *); void nr_start_idletimer(struct sock *); void nr_stop_heartbeat(struct sock *); void nr_stop_t1timer(struct sock *); void nr_stop_t2timer(struct sock *); void nr_stop_t4timer(struct sock *); void nr_stop_idletimer(struct sock *); int nr_t1timer_running(struct sock *); /* sysctl_net_netrom.c */ int nr_register_sysctl(void); void nr_unregister_sysctl(void); #endif
37 37 26 36 38 1 37 7 30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/textsearch.c Generic text search interface * * Authors: Thomas Graf <tgraf@suug.ch> * Pablo Neira Ayuso <pablo@netfilter.org> * * ========================================================================== */ /** * DOC: ts_intro * INTRODUCTION * * The textsearch infrastructure provides text searching facilities for * both linear and non-linear data. Individual search algorithms are * implemented in modules and chosen by the user. * * ARCHITECTURE * * .. code-block:: none * * User * +----------------+ * | finish()|<--------------(6)-----------------+ * |get_next_block()|<--------------(5)---------------+ | * | | Algorithm | | * | | +------------------------------+ * | | | init() find() destroy() | * | | +------------------------------+ * | | Core API ^ ^ ^ * | | +---------------+ (2) (4) (8) * | (1)|----->| prepare() |---+ | | * | (3)|----->| find()/next() |-----------+ | * | (7)|----->| destroy() |----------------------+ * +----------------+ +---------------+ * * (1) User configures a search by calling textsearch_prepare() specifying * the search parameters such as the pattern and algorithm name. * (2) Core requests the algorithm to allocate and initialize a search * configuration according to the specified parameters. * (3) User starts the search(es) by calling textsearch_find() or * textsearch_next() to fetch subsequent occurrences. A state variable * is provided to the algorithm to store persistent variables. * (4) Core eventually resets the search offset and forwards the find() * request to the algorithm. * (5) Algorithm calls get_next_block() provided by the user continuously * to fetch the data to be searched in block by block. * (6) Algorithm invokes finish() after the last call to get_next_block * to clean up any leftovers from get_next_block. (Optional) * (7) User destroys the configuration by calling textsearch_destroy(). * (8) Core notifies the algorithm to destroy algorithm specific * allocations. (Optional) * * USAGE * * Before a search can be performed, a configuration must be created * by calling textsearch_prepare() specifying the searching algorithm, * the pattern to look for and flags. As a flag, you can set TS_IGNORECASE * to perform case insensitive matching. But it might slow down * performance of algorithm, so you should use it at own your risk. * The returned configuration may then be used for an arbitrary * amount of times and even in parallel as long as a separate struct * ts_state variable is provided to every instance. * * The actual search is performed by either calling * textsearch_find_continuous() for linear data or by providing * an own get_next_block() implementation and * calling textsearch_find(). Both functions return * the position of the first occurrence of the pattern or UINT_MAX if * no match was found. Subsequent occurrences can be found by calling * textsearch_next() regardless of the linearity of the data. * * Once you're done using a configuration it must be given back via * textsearch_destroy. * * EXAMPLE:: * * int pos; * struct ts_config *conf; * struct ts_state state; * const char *pattern = "chicken"; * const char *example = "We dance the funky chicken"; * * conf = textsearch_prepare("kmp", pattern, strlen(pattern), * GFP_KERNEL, TS_AUTOLOAD); * if (IS_ERR(conf)) { * err = PTR_ERR(conf); * goto errout; * } * * pos = textsearch_find_continuous(conf, &state, example, strlen(example)); * if (pos != UINT_MAX) * panic("Oh my god, dancing chickens at %d\n", pos); * * textsearch_destroy(conf); */ /* ========================================================================== */ #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/err.h> #include <linux/textsearch.h> #include <linux/slab.h> static LIST_HEAD(ts_ops); static DEFINE_SPINLOCK(ts_mod_lock); static inline struct ts_ops *lookup_ts_algo(const char *name) { struct ts_ops *o; rcu_read_lock(); list_for_each_entry_rcu(o, &ts_ops, list) { if (!strcmp(name, o->name)) { if (!try_module_get(o->owner)) o = NULL; rcu_read_unlock(); return o; } } rcu_read_unlock(); return NULL; } /** * textsearch_register - register a textsearch module * @ops: operations lookup table * * This function must be called by textsearch modules to announce * their presence. The specified &@ops must have %name set to a * unique identifier and the callbacks find(), init(), get_pattern(), * and get_pattern_len() must be implemented. * * Returns 0 or -EEXISTS if another module has already registered * with same name. */ int textsearch_register(struct ts_ops *ops) { int err = -EEXIST; struct ts_ops *o; if (ops->name == NULL || ops->find == NULL || ops->init == NULL || ops->get_pattern == NULL || ops->get_pattern_len == NULL) return -EINVAL; spin_lock(&ts_mod_lock); list_for_each_entry(o, &ts_ops, list) { if (!strcmp(ops->name, o->name)) goto errout; } list_add_tail_rcu(&ops->list, &ts_ops); err = 0; errout: spin_unlock(&ts_mod_lock); return err; } EXPORT_SYMBOL(textsearch_register); /** * textsearch_unregister - unregister a textsearch module * @ops: operations lookup table * * This function must be called by textsearch modules to announce * their disappearance for examples when the module gets unloaded. * The &ops parameter must be the same as the one during the * registration. * * Returns 0 on success or -ENOENT if no matching textsearch * registration was found. */ int textsearch_unregister(struct ts_ops *ops) { int err = 0; struct ts_ops *o; spin_lock(&ts_mod_lock); list_for_each_entry(o, &ts_ops, list) { if (o == ops) { list_del_rcu(&o->list); goto out; } } err = -ENOENT; out: spin_unlock(&ts_mod_lock); return err; } EXPORT_SYMBOL(textsearch_unregister); struct ts_linear_state { unsigned int len; const void *data; }; static unsigned int get_linear_data(unsigned int consumed, const u8 **dst, struct ts_config *conf, struct ts_state *state) { struct ts_linear_state *st = (struct ts_linear_state *) state->cb; if (likely(consumed < st->len)) { *dst = st->data + consumed; return st->len - consumed; } return 0; } /** * textsearch_find_continuous - search a pattern in continuous/linear data * @conf: search configuration * @state: search state * @data: data to search in * @len: length of data * * A simplified version of textsearch_find() for continuous/linear data. * Call textsearch_next() to retrieve subsequent matches. * * Returns the position of first occurrence of the pattern or * %UINT_MAX if no occurrence was found. */ unsigned int textsearch_find_continuous(struct ts_config *conf, struct ts_state *state, const void *data, unsigned int len) { struct ts_linear_state *st = (struct ts_linear_state *) state->cb; conf->get_next_block = get_linear_data; st->data = data; st->len = len; return textsearch_find(conf, state); } EXPORT_SYMBOL(textsearch_find_continuous); /** * textsearch_prepare - Prepare a search * @algo: name of search algorithm * @pattern: pattern data * @len: length of pattern * @gfp_mask: allocation mask * @flags: search flags * * Looks up the search algorithm module and creates a new textsearch * configuration for the specified pattern. * * Note: The format of the pattern may not be compatible between * the various search algorithms. * * Returns a new textsearch configuration according to the specified * parameters or a ERR_PTR(). If a zero length pattern is passed, this * function returns EINVAL. */ struct ts_config *textsearch_prepare(const char *algo, const void *pattern, unsigned int len, gfp_t gfp_mask, int flags) { int err = -ENOENT; struct ts_config *conf; struct ts_ops *ops; if (len == 0) return ERR_PTR(-EINVAL); ops = lookup_ts_algo(algo); #ifdef CONFIG_MODULES /* * Why not always autoload you may ask. Some users are * in a situation where requesting a module may deadlock, * especially when the module is located on a NFS mount. */ if (ops == NULL && flags & TS_AUTOLOAD) { request_module("ts_%s", algo); ops = lookup_ts_algo(algo); } #endif if (ops == NULL) goto errout; conf = ops->init(pattern, len, gfp_mask, flags); if (IS_ERR(conf)) { err = PTR_ERR(conf); goto errout; } conf->ops = ops; return conf; errout: if (ops) module_put(ops->owner); return ERR_PTR(err); } EXPORT_SYMBOL(textsearch_prepare); /** * textsearch_destroy - destroy a search configuration * @conf: search configuration * * Releases all references of the configuration and frees * up the memory. */ void textsearch_destroy(struct ts_config *conf) { if (conf->ops) { if (conf->ops->destroy) conf->ops->destroy(conf); module_put(conf->ops->owner); } kfree(conf); } EXPORT_SYMBOL(textsearch_destroy);
30 30 30 30 30 30 30 30 30 30 30 164 166 5 5 1 4 1 165 165 19 150 149 20 154 154 152 153 154 151 6 6 1 1 1 1 30 30 30 29 30 30 30 30 30 30 30 30 30 30 30 30 30 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 30 30 29 30 30 30 30 30 30 30 30 30 29 30 30 30 29 30 30 30 30 30 30 30 30 30 30 30 30 29 30 30 48 48 9 44 44 43 44 30 30 30 34 1 30 30 30 34 33 34 34 34 32 32 31 32 32 32 32 31 32 32 34 34 34 34 33 34 33 34 34 34 34 34 33 34 34 34 33 34 34 34 34 34 2 34 33 34 30 4 34 1 34 34 29 4 30 4 34 34 34 34 34 33 34 34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/timer.h> #include <linux/rtnetlink.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "sta_info.h" #include "debugfs_sta.h" #include "mesh.h" #include "wme.h" /** * DOC: STA information lifetime rules * * STA info structures (&struct sta_info) are managed in a hash table * for faster lookup and a list for iteration. They are managed using * RCU, i.e. access to the list and hash table is protected by RCU. * * Upon allocating a STA info structure with sta_info_alloc(), the caller * owns that structure. It must then insert it into the hash table using * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an rcu read section but must not be called from * within one) will the pointer still be valid after the call. Note that * the caller may not do much with the STA info before inserting it; in * particular, it may not start any mesh peer link management or add * encryption keys. * * When the insertion fails (sta_info_insert()) returns non-zero), the * structure will have been freed by sta_info_insert()! * * Station entries are added by mac80211 when you establish a link with a * peer. This means different things for the different type of interfaces * we support. For a regular station this mean we add the AP sta when we * receive an association response from the AP. For IBSS this occurs when * get to know about a peer on the same IBSS. For WDS we add the sta for * the peer immediately upon device open. When using AP mode we add stations * for each respective station upon request from userspace through nl80211. * * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * * There is no concept of ownership on a STA entry; each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. */ struct sta_link_alloc { struct link_sta_info info; struct ieee80211_link_sta sta; struct rcu_head rcu_head; }; static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static const struct rhashtable_params link_sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct link_sta_info, link_hash_node), .key_offset = offsetof(struct link_sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_remove(&local->sta_hash, &sta->hash_node, sta_rht_params); } static int link_sta_info_hash_add(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_insert(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } static int link_sta_info_hash_del(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_remove(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } void ieee80211_purge_sta_txqs(struct sta_info *sta) { struct ieee80211_local *local = sta->sdata->local; int i; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi; if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); ieee80211_txq_purge(local, txqi); } } static void __cleanup_single_sta(struct sta_info *sta) { int ac, i; struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_PS_STA); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); clear_sta_flag(sta, WLAN_STA_PS_DELIVER); atomic_dec(&ps->num_sta_ps); } ieee80211_purge_sta_txqs(sta); for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_sta_cleanup(sta); cancel_work_sync(&sta->drv_deliver_wk); /* * Destroy aggregation state here. It would be nice to wait for the * driver to finish aggregation stop and then clean up, but for now * drivers have to handle aggregation stop being requested, followed * directly by station destruction. */ for (i = 0; i < IEEE80211_NUM_TIDS; i++) { kfree(sta->ampdu_mlme.tid_start_tx[i]); tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]); if (!tid_tx) continue; ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } } static void cleanup_single_sta(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; __cleanup_single_sta(sta); sta_info_free(local, sta); } struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); } /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } /* * Get sta info either from the specified interface * or from one of its vlans */ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->link_sta_hash, addr, link_sta_rht_params); } struct link_sta_info * link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct link_sta_info *link_sta; rcu_read_lock(); for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return link_sta; } } rcu_read_unlock(); return NULL; } struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr, unsigned int *link_id) { struct ieee80211_local *local = hw_to_local(hw); struct link_sta_info *link_sta; struct rhlist_head *tmp; for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; struct ieee80211_link_data *link; u8 _link_id = link_sta->link_id; if (!localaddr) { if (link_id) *link_id = _link_id; return &sta->sta; } link = rcu_dereference(sta->sdata->link[_link_id]); if (!link) continue; if (memcmp(link->conf->addr, localaddr, ETH_ALEN)) continue; if (link_id) *link_id = _link_id; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_link_addrs); struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr) { struct rhlist_head *tmp; struct sta_info *sta; for_each_sta_info(local, sta_addr, sta, tmp) { if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) return sta; } return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; int i = 0; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (sdata != sta->sdata) continue; if (i < idx) { ++i; continue; } return sta; } return NULL; } static void sta_info_free_link(struct link_sta_info *link_sta) { free_percpu(link_sta->pcpu_rx_stats); } static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) { struct link_sta_info *link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); struct ieee80211_link_data *link; int ac, tid; u32 thr; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { sta->rem_link_stats.tx_packets += link_sta->tx_stats.packets[ac]; sta->rem_link_stats.tx_bytes += link_sta->tx_stats.bytes[ac]; } sta->rem_link_stats.rx_packets += link_sta->rx_stats.packets; sta->rem_link_stats.rx_bytes += link_sta->rx_stats.bytes; sta->rem_link_stats.tx_retries += link_sta->status_stats.retry_count; sta->rem_link_stats.tx_failed += link_sta->status_stats.retry_failed; sta->rem_link_stats.rx_dropped_misc += link_sta->rx_stats.dropped; thr = sta_get_expected_throughput(sta); if (thr != 0) sta->rem_link_stats.expected_throughput += thr; for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { sta->rem_link_stats.pertid_stats.rx_msdu += link_sta->rx_stats.msdu[tid]; sta->rem_link_stats.pertid_stats.tx_msdu += link_sta->tx_stats.msdu[tid]; sta->rem_link_stats.pertid_stats.tx_msdu_retries += link_sta->status_stats.msdu_retries[tid]; sta->rem_link_stats.pertid_stats.tx_msdu_failed += link_sta->status_stats.msdu_failed[tid]; } if (sta->sdata->vif.type == NL80211_IFTYPE_STATION) { link = wiphy_dereference(sta->sdata->local->hw.wiphy, sta->sdata->link[link_id]); if (link) sta->rem_link_stats.beacon_loss_count += link->u.mgd.beacon_loss_count; } } static void sta_remove_link(struct sta_info *sta, unsigned int link_id, bool unhash) { struct sta_link_alloc *alloc = NULL; struct link_sta_info *link_sta; lockdep_assert_wiphy(sta->local->hw.wiphy); link_sta = rcu_access_pointer(sta->link[link_id]); if (WARN_ON(!link_sta)) return; if (unhash) link_sta_info_hash_del(sta->local, link_sta); if (test_sta_flag(sta, WLAN_STA_INSERTED)) ieee80211_link_sta_debugfs_remove(link_sta); if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); sta->sta.valid_links &= ~BIT(link_id); /* store removed link info for accumulated stats consistency */ sta_accumulate_removed_link_stats(sta, link_id); RCU_INIT_POINTER(sta->link[link_id], NULL); RCU_INIT_POINTER(sta->sta.link[link_id], NULL); if (alloc) { sta_info_free_link(&alloc->info); kfree_rcu(alloc, rcu_head); } ieee80211_sta_recalc_aggregates(&sta->sta); } /** * sta_info_free - free STA * * @local: pointer to the global information * @sta: STA info to free * * This function must undo everything done by sta_info_alloc() * that may happen before sta_info_insert(). It may only be * called when sta_info_insert() has not been attempted (and * if that fails, the station is freed anyway.) */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_access_pointer(sta->link[i]); if (!link_sta) continue; sta_remove_link(sta, i, false); } /* * If we had used sta_info_pre_move_state() then we might not * have gone through the state transitions down again, so do * it here now (and warn if it's inserted). * * This will clear state such as fast TX/RX that may have been * allocated during state transitions. */ while (sta->sta_state > IEEE80211_STA_NONE) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, sta->sta_state - 1); if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) break; } if (sta->rate_ctrl) rate_control_free_sta(sta); sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif sta_info_free_link(&sta->deflink); kfree(sta); } static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_insert(&local->sta_hash, &sta->hash_node, sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) { struct sta_info *sta; sta = container_of(wk, struct sta_info, drv_deliver_wk); if (sta->dead) return; local_bh_disable(); if (!test_sta_flag(sta, WLAN_STA_PS_STA)) ieee80211_sta_ps_deliver_wakeup(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) ieee80211_sta_ps_deliver_poll_response(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) ieee80211_sta_ps_deliver_uapsd(sta); local_bh_enable(); } static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; return 0; } static int sta_info_alloc_link(struct ieee80211_local *local, struct link_sta_info *link_info, gfp_t gfp) { struct ieee80211_hw *hw = &local->hw; int i; if (ieee80211_hw_check(hw, USES_RSS)) { link_info->pcpu_rx_stats = alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); if (!link_info->pcpu_rx_stats) return -ENOMEM; } link_info->rx_stats.last_rx = jiffies; u64_stats_init(&link_info->rx_stats.syncp); ewma_signal_init(&link_info->rx_stats_avg.signal); ewma_avg_signal_init(&link_info->status_stats.avg_ack_signal); for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++) ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]); link_info->rx_omi_bw_rx = IEEE80211_STA_RX_BW_MAX; link_info->rx_omi_bw_tx = IEEE80211_STA_RX_BW_MAX; link_info->rx_omi_bw_staging = IEEE80211_STA_RX_BW_MAX; /* * Cause (a) warning(s) if IEEE80211_STA_RX_BW_MAX != 320 * or if new values are added to the enum. */ switch (link_info->cur_max_bandwidth) { case IEEE80211_STA_RX_BW_20: case IEEE80211_STA_RX_BW_40: case IEEE80211_STA_RX_BW_80: case IEEE80211_STA_RX_BW_160: case IEEE80211_STA_RX_BW_MAX: /* intentionally nothing */ break; } return 0; } static void sta_info_add_link(struct sta_info *sta, unsigned int link_id, struct link_sta_info *link_info, struct ieee80211_link_sta *link_sta) { link_info->sta = sta; link_info->link_id = link_id; link_info->pub = link_sta; link_info->pub->sta = &sta->sta; link_sta->link_id = link_id; rcu_assign_pointer(sta->link[link_id], link_info); rcu_assign_pointer(sta->sta.link[link_id], link_sta); link_sta->smps_mode = IEEE80211_SMPS_OFF; link_sta->agg.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA; } static struct sta_info * __sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, int link_id, const u8 *link_addr, gfp_t gfp) { struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; void *txq_data; int size; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); if (!sta) return NULL; sta->local = local; sta->sdata = sdata; if (sta_info_alloc_link(local, &sta->deflink, gfp)) goto free; if (link_id >= 0) { sta_info_add_link(sta, link_id, &sta->deflink, &sta->sta.deflink); sta->sta.valid_links = BIT(link_id); } else { sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink); } sta->sta.cur = &sta->sta.deflink.agg; spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work); #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) { sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (!sdata->u.mesh.user_mpm) timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); memcpy(sta->deflink.addr, link_addr, ETH_ALEN); memcpy(sta->sta.deflink.addr, link_addr, ETH_ALEN); sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; /* TODO link specific alloc and assignments for MLO Link STA */ /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. * The Tx path starts to use a key as soon as the key slot ptk_idx * references to is not NULL. To not use the initial Rx-only key * prematurely for Tx initialize ptk_idx to an impossible PTK keyid * which always will refer to a NULL key. */ BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX); sta->ptk_idx = INVALID_PTK_KEYIDX; ieee80211_init_frag_cache(&sta->frags); sta->sta_state = IEEE80211_STA_NONE; if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) sta->amsdu_mesh_control = -1; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); size = sizeof(struct txq_info) + ALIGN(hw->txq_data_size, sizeof(void *)); txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); if (!txq_data) goto free; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; /* might not do anything for the (bufferable) MMPDU TXQ */ ieee80211_txq_init(sdata, sta, txq, i); } if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; atomic_set(&sta->airtime[i].aql_tx_pending, 0); sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); for (i = 0; i < NUM_NL80211_BANDS; i++) { u32 mandatory = 0; int r; if (!hw->wiphy->bands[i]) continue; switch (i) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use * for this is when we don't know anything yet and send * management frames, and then we'll pick the lowest * possible rate anyway. * If we don't include _G here, we cannot find a rate * in P2P, and thus trigger the WARN_ONCE() in rate.c */ mandatory = IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; break; case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: mandatory = IEEE80211_RATE_MANDATORY_A; break; case NL80211_BAND_60GHZ: WARN_ON(1); mandatory = 0; break; } for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { struct ieee80211_rate *rate; rate = &hw->wiphy->bands[i]->bitrates[r]; if (!(rate->flags & mandatory)) continue; sta->sta.deflink.supp_rates[i] |= BIT(r); } } sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; free_txq: kfree(to_txq_info(sta->sta.txq[0])); free: sta_info_free_link(&sta->deflink); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif kfree(sta); return NULL; } struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp) { return __sta_info_alloc(sdata, addr, -1, addr, gfp); } struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata, const u8 *mld_addr, unsigned int link_id, const u8 *link_addr, gfp_t gfp) { return __sta_info_alloc(sdata, mld_addr, link_id, link_addr, gfp); } static int sta_info_insert_check(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * Can't be a WARN_ON because it can be triggered through a race: * something inserts a STA (on one CPU) without holding the RTNL * and another CPU turns off the net device. */ if (unlikely(!ieee80211_sdata_running(sdata))) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to * asynchronous resize/rehash. We also require the mutex * for correctness. */ rcu_read_lock(); if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { rcu_read_unlock(); return -ENOTUNIQ; } rcu_read_unlock(); return 0; } static int sta_info_insert_drv_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { enum ieee80211_sta_state state; int err = 0; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { err = drv_sta_state(local, sdata, sta, state, state + 1); if (err) break; } if (!err) { /* * Drivers using legacy sta_add/sta_remove callbacks only * get uploaded set to true after sta_add is called. */ if (!local->ops->sta_add) sta->uploaded = true; return 0; } if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { sdata_info(sdata, "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n", sta->sta.addr, state + 1, err); err = 0; } /* unwind on error */ for (; state > IEEE80211_STA_NOTEXIST; state--) WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1)); return err; } static void ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; bool allow_p2p_go_ps = sdata->vif.p2p; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; if (!sta->sta.support_p2p_ps) { allow_p2p_go_ps = false; break; } } rcu_read_unlock(); if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_P2P_PS); } } static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo = NULL; int err = 0; lockdep_assert_wiphy(local->hw.wiphy); /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_cleanup; } sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); if (!sinfo) { err = -ENOMEM; goto out_cleanup; } local->num_sta++; local->sta_generation++; smp_mb(); /* simplify things and don't accept BA sessions yet */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); /* make the station visible */ err = sta_info_hash_add(local, sta); if (err) goto out_drop_sta; if (sta->sta.valid_links) { err = link_sta_info_hash_add(local, &sta->deflink); if (err) { sta_info_hash_del(local, sta); goto out_drop_sta; } } list_add_tail_rcu(&sta->list, &local->sta_list); /* update channel context before notifying the driver about state * change, this enables driver using the updated channel context right away. */ if (sta->sta_state >= IEEE80211_STA_ASSOC) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } /* notify driver */ err = sta_info_insert_drv_state(local, sdata, sta); if (err) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); if (sta->sta.valid_links) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); if (!link_sta) continue; ieee80211_link_sta_debugfs_add(link_sta); if (sdata->vif.active_links & BIT(i)) ieee80211_link_sta_debugfs_drv_add(link_sta); } } else { ieee80211_link_sta_debugfs_add(&sta->deflink); ieee80211_link_sta_debugfs_drv_add(&sta->deflink); } sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); /* move reference to rcu-protected */ rcu_read_lock(); if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); ieee80211_check_fast_xmit(sta); return 0; out_remove: if (sta->sta.valid_links) link_sta_info_hash_del(local, &sta->deflink); sta_info_hash_del(local, sta); list_del_rcu(&sta->list); out_drop_sta: local->num_sta--; synchronize_net(); out_cleanup: cleanup_single_sta(sta); kfree(sinfo); rcu_read_lock(); return err; } int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; int err; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); err = sta_info_insert_check(sta); if (err) { sta_info_free(local, sta); rcu_read_lock(); return err; } return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) { int err = sta_info_insert_rcu(sta); rcu_read_unlock(); return err; } static inline void __bss_tim_set(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __set_bit() format. */ tim[id / 8] |= (1 << (id % 8)); } static inline void __bss_tim_clear(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __clear_bit() format. */ tim[id / 8] &= ~(1 << (id % 8)); } static inline bool __bss_tim_get(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the test_bit() format. */ return tim[id / 8] & (1 << (id % 8)); } static unsigned long ieee80211_tids_for_ac(int ac) { /* If we ever support TIDs > 7, this obviously needs to be adjusted */ switch (ac) { case IEEE80211_AC_VO: return BIT(6) | BIT(7); case IEEE80211_AC_VI: return BIT(4) | BIT(5); case IEEE80211_AC_BE: return BIT(0) | BIT(3); case IEEE80211_AC_BK: return BIT(1) | BIT(2); default: WARN_ON(1); return 0; } } static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) { struct ieee80211_local *local = sta->local; struct ps_data *ps; bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (WARN_ON_ONCE(!sta->sdata->bss)) return; ps = &sta->sdata->bss->ps; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; #endif } else { return; } /* No need to do anything if the driver does all */ if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) goto done; /* * If all ACs are delivery-enabled then we should build * the TIM bit for all ACs anyway; if only some are then * we ignore those and build the TIM bit using only the * non-enabled ones. */ if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_tim = 0; if (ignore_pending) ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac]) continue; indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac]); if (indicate_tim) break; tids = ieee80211_tids_for_ac(ac); indicate_tim |= sta->driver_buffered_tids & tids; indicate_tim |= sta->txq_buffered_tids & tids; } done: spin_lock_bh(&local->tim_lock); if (indicate_tim == __bss_tim_get(ps->tim, id)) goto out_unlock; if (indicate_tim) __bss_tim_set(ps->tim, id); else __bss_tim_clear(ps->tim, id); if (local->ops->set_tim && !WARN_ON(sta->dead)) { local->tim_in_locked_section = true; drv_set_tim(local, &sta->sta, indicate_tim); local->tim_in_locked_section = false; } out_unlock: spin_unlock_bh(&local->tim_lock); } void sta_info_recalc_tim(struct sta_info *sta) { __sta_info_recalc_tim(sta, false); } static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info; int timeout; if (!skb) return false; info = IEEE80211_SKB_CB(skb); /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ timeout = (sta->listen_interval * sta->sdata->vif.bss_conf.beacon_int * 32 / 15625) * HZ; if (timeout < STA_TX_BUFFER_EXPIRE) timeout = STA_TX_BUFFER_EXPIRE; return time_after(jiffies, info->control.jiffies + timeout); } static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local, struct sta_info *sta, int ac) { unsigned long flags; struct sk_buff *skb; /* * First check for frames that should expire on the filtered * queue. Frames here were rejected by the driver and are on * a separate queue to avoid reordering with normal PS-buffered * frames. They also aren't accounted for right now in the * total_ps_buffered counter. */ for (;;) { spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb = skb_peek(&sta->tx_filtered[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->tx_filtered[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); /* * Frames are queued in order, so if this one * hasn't expired yet we can stop testing. If * we actually reached the end of the queue we * also need to stop, of course. */ if (!skb) break; ieee80211_free_txskb(&local->hw, skb); } /* * Now also check the normal PS-buffered queue, this will * only find something if the filtered queue was emptied * since the filtered frames are all before the normal PS * buffered frames. */ for (;;) { spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb = skb_peek(&sta->ps_tx_buf[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->ps_tx_buf[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); /* * frames are queued in order, so if this one * hasn't expired yet (or we reached the end of * the queue) we can stop testing */ if (!skb) break; local->total_ps_buffered--; ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n", sta->sta.addr); ieee80211_free_txskb(&local->hw, skb); } /* * Finally, recalculate the TIM bit for this station -- it might * now be clear because the station was too slow to retrieve its * frames. */ sta_info_recalc_tim(sta); /* * Return whether there are any frames still buffered, this is * used to check whether the cleanup timer still needs to run, * if there are no frames we don't need to rearm the timer. */ return !(skb_queue_empty(&sta->ps_tx_buf[ac]) && skb_queue_empty(&sta->tx_filtered[ac])); } static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, struct sta_info *sta) { bool have_buffered = false; int ac; /* This is only necessary for stations on BSS/MBSS interfaces */ if (!sta->sdata->bss && !ieee80211_vif_is_mesh(&sta->sdata->vif)) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) have_buffered |= sta_info_cleanup_expire_buffered_ac(local, sta, ac); return have_buffered; } static int __must_check __sta_info_destroy_part1(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; int ret, i; might_sleep(); if (!sta) return -ENOENT; local = sta->local; sdata = sta->sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * Before removing the station from the driver and * rate control, it might still start new aggregation * sessions -- block that to make sure the tear-down * will be sufficient. */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); /* * Before removing the station from the driver there might be pending * rx frames on RSS queues sent prior to the disassociation - wait for * all such frames to be processed. */ drv_sync_rx_queues(local, sta); for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; if (!(sta->sta.valid_links & BIT(i))) continue; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); link_sta_info_hash_del(local, link_sta); } ret = sta_info_hash_del(local, sta); if (WARN_ON(ret)) return ret; /* * for TDLS peers, make sure to return to the base channel before * removal. */ if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) { drv_tdls_cancel_channel_switch(local, sdata, &sta->sta); clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL); } list_del_rcu(&sta->list); sta->removed = true; if (sta->uploaded) drv_sta_pre_rcu_remove(local, sta->sdata, sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && rcu_access_pointer(sdata->u.vlan.sta) == sta) RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); return 0; } static int _sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state, bool recalc) { struct ieee80211_local *local = sta->local; might_sleep(); if (sta->sta_state == new_state) return 0; /* check allowed transitions first */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state != IEEE80211_STA_AUTH) return -EINVAL; break; case IEEE80211_STA_AUTH: if (sta->sta_state != IEEE80211_STA_NONE && sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; case IEEE80211_STA_ASSOC: if (sta->sta_state != IEEE80211_STA_AUTH && sta->sta_state != IEEE80211_STA_AUTHORIZED) return -EINVAL; break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; default: WARN(1, "invalid state %d", new_state); return -EINVAL; } sta_dbg(sta->sdata, "moving STA %pM to state %d\n", sta->sta.addr, new_state); /* notify the driver before the actual changes so it can * fail the transition if the state is increasing. * The driver is required not to fail when the transition * is decreasing the state, so first, do all the preparation * work and only then, notify the driver. */ if (new_state > sta->sta_state && test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) return err; } /* reflect the change in all state variables */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state == IEEE80211_STA_AUTH) clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); sta->assoc_at = ktime_get_boottime_ns(); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ieee80211_vif_dec_num_mcast(sta->sdata); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); /* * If we have encryption offload, flush (station) queues * (after ensuring concurrent TX completed) so we won't * transmit anything later unencrypted if/when keys are * also removed, which might otherwise happen depending * on how the hardware offload works. */ if (local->ops->set_key) { synchronize_net(); if (local->ops->flush_sta) drv_flush_sta(local, sta->sdata, sta); else ieee80211_flush_queues(local, sta->sdata, false); } ieee80211_clear_fast_xmit(sta); ieee80211_clear_fast_rx(sta); } break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state == IEEE80211_STA_ASSOC) { ieee80211_vif_inc_num_mcast(sta->sdata); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); ieee80211_check_fast_rx(sta); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sta->sdata->vif.type == NL80211_IFTYPE_AP) cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); break; default: break; } if (new_state < sta->sta_state && test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); WARN_ONCE(err, "Driver is not allowed to fail if the sta_state is transitioning down the list: %d\n", err); } sta->sta_state = new_state; return 0; } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { return _sta_info_move_state(sta, new_state, true); } static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo; int ret; /* * NOTE: This assumes at least synchronize_net() was done * after _part1 and before _part2! */ /* * There's a potential race in _part1 where we set WLAN_STA_BLOCK_BA * but someone might have just gotten past a check, and not yet into * queuing the work/creating the data/etc. * * Do another round of destruction so that the worker is certainly * canceled before we later free the station. * * Since this is after synchronize_rcu()/synchronize_net() we're now * certain that nobody can actually hold a reference to the STA and * be calling e.g. ieee80211_start_tx_ba_session(). */ ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc); WARN_ON_ONCE(ret); } /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); /* disable TIM bit - last chance to tell driver */ __sta_info_recalc_tim(sta, true); sta->dead = true; local->num_sta--; local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc); if (ret) { WARN_ON_ONCE(1); break; } } if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); WARN_ON_ONCE(ret != 0); } sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); ieee80211_destroy_frag_cache(&sta->frags); cleanup_single_sta(sta); } int __must_check __sta_info_destroy(struct sta_info *sta) { int err = __sta_info_destroy_part1(sta); if (err) return err; synchronize_net(); __sta_info_destroy_part2(sta, true); return 0; } int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get(sdata, addr); return __sta_info_destroy(sta); } int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, addr); return __sta_info_destroy(sta); } static void sta_info_cleanup(struct timer_list *t) { struct ieee80211_local *local = timer_container_of(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) if (sta_info_cleanup_expire_buffered(local, sta)) timer_needed = true; rcu_read_unlock(); if (local->quiescing) return; if (!timer_needed) return; mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } int sta_info_init(struct ieee80211_local *local) { int err; err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; err = rhltable_init(&local->link_sta_hash, &link_sta_rht_params); if (err) { rhltable_destroy(&local->sta_hash); return err; } spin_lock_init(&local->tim_lock); INIT_LIST_HEAD(&local->sta_list); timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } void sta_info_stop(struct ieee80211_local *local) { timer_delete_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); rhltable_destroy(&local->link_sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id, struct sta_info *do_not_flush_sta) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; LIST_HEAD(free_list); int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); WARN_ON(vlans && !sdata->bss); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { if (sdata != sta->sdata && (!vlans || sdata->bss != sta->sdata->bss)) continue; if (sta == do_not_flush_sta) continue; if (link_id >= 0 && sta->sta.valid_links && !(sta->sta.valid_links & BIT(link_id))) continue; if (!WARN_ON(__sta_info_destroy_part1(sta))) list_add(&sta->free_list, &free_list); ret++; } if (!list_empty(&free_list)) { bool support_p2p_ps = true; synchronize_net(); list_for_each_entry_safe(sta, tmp, &free_list, free_list) { if (!sta->sta.support_p2p_ps) support_p2p_ps = false; __sta_info_destroy_part2(sta, false); } ieee80211_recalc_min_chandef(sdata, -1); if (!support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sdata); } return ret; } void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta, -1); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); if (ieee80211_vif_is_mesh(&sdata->vif) && test_sta_flag(sta, WLAN_STA_PS_STA)) atomic_dec(&sdata->u.mesh.ps.num_sta_ps); WARN_ON(__sta_info_destroy(sta)); } } } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); struct rhlist_head *tmp; struct sta_info *sta; /* * Just return a random station if localaddr is NULL * ... first in list. */ for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; if (!sta->uploaded) return NULL; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr) { struct sta_info *sta; if (!vif) return NULL; sta = sta_info_get_bss(vif_to_sdata(vif), addr); if (!sta) return NULL; if (!sta->uploaded) return NULL; return &sta->sta; } EXPORT_SYMBOL(ieee80211_find_sta); /* powersave support code */ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; int filtered = 0, buffered = 0, ac, i; unsigned long flags; struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_SP); BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i])) continue; schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i])); } skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ spin_lock_bh(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending); spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); tmp = skb_queue_len(&pending); filtered += tmp - count; count = tmp; spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending); spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); tmp = skb_queue_len(&pending); buffered += tmp - count; } ieee80211_add_pending_skbs(local, &pending); /* now we're no longer in the deliver code */ clear_sta_flag(sta, WLAN_STA_PS_DELIVER); /* The station might have polled and then woken up before we responded, * so clear these flags now to avoid them sticking around. */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); spin_unlock_bh(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); local->total_ps_buffered -= buffered; sta_info_recalc_tim(sta); ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct sta_info *sta, int tid, enum ieee80211_frame_release_type reason, bool call_driver, bool more_data) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; skb->priority = tid; skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]); if (qos) { nullfunc->qos_ctrl = cpu_to_le16(tid); if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); if (more_data) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } } info = IEEE80211_SKB_CB(skb); /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. Also set EOSP to indicate this packet * ends the poll/service period. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; if (call_driver) drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); skb->dev = sdata->dev; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } info->band = chanctx_conf->def.chan->band; ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } static int find_highest_prio_tid(unsigned long tids) { /* lower 3 TIDs aren't ordered perfectly */ if (tids & 0xF8) return fls(tids) - 1; /* TID 0 is BE just like TID 3 */ if (tids & BIT(0)) return 0; return fls(tids) - 1; } /* Indicates if the MORE_DATA bit should be set in the last * frame obtained by ieee80211_sta_ps_get_frames. * Note that driver_release_tids is relevant only if * reason = IEEE80211_FRAME_RELEASE_PSPOLL */ static bool ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, enum ieee80211_frame_release_type reason, unsigned long driver_release_tids) { int ac; /* If the driver has data on more than one TID then * certainly there's more data if we release just a * single frame now (from a single TID). This will * only happen for PS-Poll. */ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) return true; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) return true; } return false; } static void ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason, struct sk_buff_head *frames, unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ac; /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; tids = ieee80211_tids_for_ac(ac); /* if we already have frames from software, then we can't also * release from hardware queues */ if (skb_queue_empty(frames)) { *driver_release_tids |= sta->driver_buffered_tids & tids; *driver_release_tids |= sta->txq_buffered_tids & tids; } if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { skb = skb_dequeue(&sta->tx_filtered[ac]); if (!skb) { skb = skb_dequeue( &sta->ps_tx_buf[ac]); if (skb) local->total_ps_buffered--; } if (!skb) break; n_frames--; __skb_queue_tail(frames, skb); } } /* If we have more frames buffered on this AC, then abort the * loop since we can't send more data from other ACs before * the buffered frames from this. */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) break; } } static void ieee80211_sta_ps_deliver_response(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; unsigned long driver_release_tids = 0; struct sk_buff_head frames; bool more_data; /* Service or PS-Poll period starts */ set_sta_flag(sta, WLAN_STA_SP); __skb_queue_head_init(&frames); ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason, &frames, &driver_release_tids); more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL) driver_release_tids = BIT(find_highest_prio_tid(driver_release_tids)); if (skb_queue_empty(&frames) && !driver_release_tids) { int tid, ac; /* * For PS-Poll, this can only happen due to a race condition * when we set the TIM bit and the station notices it, but * before it can poll for the frame we expire it. * * For uAPSD, this is said in the standard (11.2.1.5 h): * At each unscheduled SP for a non-AP STA, the AP shall * attempt to transmit at least one MSDU or MMPDU, but no * more than the value specified in the Max SP Length field * in the QoS Capability element from delivery-enabled ACs, * that are destined for the non-AP STA. * * Since we have no other MSDU/MMPDU, transmit a QoS null frame. */ /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; int num = 0; u16 tids = 0; bool need_null = false; skb_queue_head_init(&pending); while ((skb = __skb_dequeue(&frames))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qoshdr = NULL; num++; /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; /* * Use MoreData flag to indicate whether there are * more buffered frames for this STA */ if (more_data || !skb_queue_empty(&frames)) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); else hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) qoshdr = ieee80211_get_qos_ctl(hdr); tids |= BIT(skb->priority); __skb_queue_tail(&pending, skb); /* end service period after last frame or add one */ if (!skb_queue_empty(&frames)) continue; if (reason != IEEE80211_FRAME_RELEASE_UAPSD) { /* for PS-Poll, there's only one frame */ info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; break; } /* For uAPSD, things are a bit more complicated. If the * last frame has a QoS header (i.e. is a QoS-data or * QoS-nulldata frame) then just set the EOSP bit there * and be done. * If the frame doesn't have a QoS header (which means * it should be a bufferable MMPDU) then we can't set * the EOSP bit in the QoS header; add a QoS-nulldata * frame to the list to send it after the MMPDU. * * Note that this code is only in the mac80211-release * code path, we assume that the driver will not buffer * anything but QoS-data frames, or if it does, will * create the QoS-nulldata frame by itself if needed. * * Cf. 802.11-2012 10.2.1.10 (c). */ if (qoshdr) { *qoshdr |= IEEE80211_QOS_CTL_EOSP; info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; } else { /* The standard isn't completely clear on this * as it says the more-data bit should be set * if there are more BUs. The QoS-Null frame * we're about to send isn't buffered yet, we * only create it below, but let's pretend it * was buffered just in case some clients only * expect more-data=0 when eosp=1. */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); need_null = true; num++; } break; } drv_allow_buffered_frames(local, sta, tids, num, reason, more_data); ieee80211_add_pending_skbs(local, &pending); if (need_null) ieee80211_send_null_response( sta, find_highest_prio_tid(tids), reason, false, false); sta_info_recalc_tim(sta); } else { int tid; /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. * Note that the driver also has to check the number of frames * on the TIDs we're releasing from - if there are more than * n_frames it has to set the more-data bit (if we didn't ask * it to set it anyway due to other buffered frames); if there * are fewer than n_frames it has to make sure to adjust that * to allow the service period to end properly. */ drv_release_buffered_frames(local, sta, driver_release_tids, n_frames, reason, more_data); /* * Note that we don't recalculate the TIM bit here as it would * most likely have no effect at all unless the driver told us * that the TID(s) became empty before returning here from the * release function. * Either way, however, when the driver tells us that the TID(s) * became empty or we find that a txq became empty, we'll do the * TIM recalculation. */ for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); break; } } } void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) { u8 ignore_for_response = sta->sta.uapsd_queues; /* * If all ACs are delivery-enabled then we should reply * from any of them, if only some are enabled we reply * only from the non-enabled ones. */ if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_response = 0; ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response, IEEE80211_FRAME_RELEASE_PSPOLL); } void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta) { int n_frames = sta->sta.max_sp; u8 delivery_enabled = sta->sta.uapsd_queues; /* * If we ever grow support for TSPEC this might happen if * the TSPEC update from hostapd comes in between a trigger * frame setting WLAN_STA_UAPSD in the RX path and this * actually getting called. */ if (!delivery_enabled) return; switch (sta->sta.max_sp) { case 1: n_frames = 2; break; case 2: n_frames = 4; break; case 3: n_frames = 6; break; case 0: /* XXX: what is a good value? */ n_frames = 128; break; } ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled, IEEE80211_FRAME_RELEASE_UAPSD); } void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); trace_api_sta_block_awake(sta->local, pubsta, block); if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_clear_fast_xmit(sta); return; } if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) return; if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { set_sta_flag(sta, WLAN_STA_PS_DELIVER); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) || test_sta_flag(sta, WLAN_STA_UAPSD)) { /* must be asleep in this case */ clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->local; trace_api_eosp(local, pubsta); clear_sta_flag(sta, WLAN_STA_SP); } EXPORT_SYMBOL(ieee80211_sta_eosp); void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); enum ieee80211_frame_release_type reason; bool more_data; trace_api_send_eosp_nullfunc(sta->local, pubsta, tid); reason = IEEE80211_FRAME_RELEASE_UAPSD; more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues, reason, 0); ieee80211_send_null_response(sta, tid, reason, false, more_data); } EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc); void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered); if (buffered) set_bit(tid, &sta->driver_buffered_tids); else clear_bit(tid, &sta->driver_buffered_tids); sta_info_recalc_tim(sta); } EXPORT_SYMBOL(ieee80211_sta_set_buffered); void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->sdata->local; u8 ac = ieee80211_ac_from_tid(tid); u32 airtime = 0; if (sta->local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; if (sta->local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; spin_lock_bh(&local->active_txq_lock[ac]); sta->airtime[ac].tx_airtime += tx_airtime; sta->airtime[ac].rx_airtime += rx_airtime; if (ieee80211_sta_keep_active(sta, ac)) sta->airtime[ac].deficit -= airtime; spin_unlock_bh(&local->active_txq_lock[ac]); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links) { bool first = true; int link_id; if (!sta->sta.valid_links || !sta->sta.mlo) { sta->sta.cur = &sta->sta.deflink.agg; return; } rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { struct ieee80211_link_sta *link_sta; int i; if (!(active_links & BIT(link_id))) continue; link_sta = rcu_dereference(sta->sta.link[link_id]); if (!link_sta) continue; if (first) { sta->cur = sta->sta.deflink.agg; first = false; continue; } sta->cur.max_amsdu_len = min(sta->cur.max_amsdu_len, link_sta->agg.max_amsdu_len); sta->cur.max_rc_amsdu_len = min(sta->cur.max_rc_amsdu_len, link_sta->agg.max_rc_amsdu_len); for (i = 0; i < ARRAY_SIZE(sta->cur.max_tid_amsdu_len); i++) sta->cur.max_tid_amsdu_len[i] = min(sta->cur.max_tid_amsdu_len[i], link_sta->agg.max_tid_amsdu_len[i]); } rcu_read_unlock(); sta->sta.cur = &sta->cur; } void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed) { int tx_pending; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return; if (!tx_completed) { if (sta) atomic_add(tx_airtime, &sta->airtime[ac].aql_tx_pending); atomic_add(tx_airtime, &local->aql_total_pending_airtime); atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]); return; } if (sta) { tx_pending = atomic_sub_return(tx_airtime, &sta->airtime[ac].aql_tx_pending); if (tx_pending < 0) atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, tx_pending, 0); } atomic_sub(tx_airtime, &local->aql_total_pending_airtime); tx_pending = atomic_sub_return(tx_airtime, &local->aql_ac_pending_airtime[ac]); if (WARN_ONCE(tx_pending < 0, "Device %s AC %d pending airtime underflow: %u, %u", wiphy_name(local->hw.wiphy), ac, tx_pending, tx_airtime)) { atomic_cmpxchg(&local->aql_ac_pending_airtime[ac], tx_pending, 0); atomic_sub(tx_pending, &local->aql_total_pending_airtime); } } static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta, int link_id) { struct ieee80211_sta_rx_stats *stats; struct link_sta_info *link_sta_info; int cpu; if (link_id < 0) link_sta_info = &sta->deflink; else link_sta_info = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); stats = &link_sta_info->rx_stats; if (!link_sta_info->pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; cpustats = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; } return stats; } static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); switch (STA_STATS_GET(TYPE, rate)) { case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); rinfo->nss = STA_STATS_GET(VHT_NSS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; rinfo->mcs = STA_STATS_GET(HT_MCS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband->bitrates)) break; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) shift = 1; else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } case STA_STATS_RATE_TYPE_HE: rinfo->flags = RATE_INFO_FLAGS_HE_MCS; rinfo->mcs = STA_STATS_GET(HE_MCS, rate); rinfo->nss = STA_STATS_GET(HE_NSS, rate); rinfo->he_gi = STA_STATS_GET(HE_GI, rate); rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; case STA_STATS_RATE_TYPE_EHT: rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); rinfo->nss = STA_STATS_GET(EHT_NSS, rate); rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo, int link_id) { u32 rate = READ_ONCE(sta_get_last_rx_stats(sta, link_id)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; sta_stats_decode_rate(sta->local, rate, rinfo); return 0; } static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, int tid) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid, int link_id) { struct ieee80211_local *local = sta->local; struct link_sta_info *link_sta_info; int cpu; if (link_id < 0) link_sta_info = &sta->deflink; else link_sta_info = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { tidstats->rx_msdu += sta_get_tidstats_msdu(&link_sta_info->rx_stats, tid); if (link_sta_info->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); } } tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); tidstats->tx_msdu = link_sta_info->tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = link_sta_info->status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = link_sta_info->status_stats.msdu_failed[tid]; } if (link_id < 0 && tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); spin_unlock_bh(&local->fq.lock); } } static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->bytes; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } #ifdef CONFIG_MAC80211_MESH static void sta_set_mesh_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_local *local = sta->sdata->local; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | BIT_ULL(NL80211_STA_INFO_PLID) | BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; sinfo->connected_to_as = sta->mesh->connected_to_as; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); } #endif void sta_set_accumulated_removed_links_sinfo(struct sta_info *sta, struct station_info *sinfo) { /* Accumulating the removed link statistics. */ sinfo->tx_packets = sta->rem_link_stats.tx_packets; sinfo->rx_packets = sta->rem_link_stats.rx_packets; sinfo->tx_bytes = sta->rem_link_stats.tx_bytes; sinfo->rx_bytes = sta->rem_link_stats.rx_bytes; sinfo->tx_retries = sta->rem_link_stats.tx_retries; sinfo->tx_failed = sta->rem_link_stats.tx_failed; sinfo->rx_dropped_misc = sta->rem_link_stats.rx_dropped_misc; sinfo->beacon_loss_count = sta->rem_link_stats.beacon_loss_count; sinfo->expected_throughput = sta->rem_link_stats.expected_throughput; if (sinfo->pertid) { sinfo->pertid->rx_msdu = sta->rem_link_stats.pertid_stats.rx_msdu; sinfo->pertid->tx_msdu = sta->rem_link_stats.pertid_stats.tx_msdu; sinfo->pertid->tx_msdu_retries = sta->rem_link_stats.pertid_stats.tx_msdu_retries; sinfo->pertid->tx_msdu_failed = sta->rem_link_stats.pertid_stats.tx_msdu_failed; } } static void sta_set_link_sinfo(struct sta_info *sta, struct link_station_info *link_sinfo, struct ieee80211_link_data *link, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_sta_rx_stats *last_rxstats; int i, ac, cpu, link_id = link->link_id; struct link_sta_info *link_sta_info; u32 thr = 0; last_rxstats = sta_get_last_rx_stats(sta, link_id); link_sta_info = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); /* do before driver, so beacon filtering drivers have a * chance to e.g. just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) link_sinfo->rx_beacon = link->u.mgd.count_beacon_signal; ether_addr_copy(link_sinfo->addr, link_sta_info->addr); drv_link_sta_statistics(sta->local, sdata, link_sta_info->pub, link_sinfo); link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { link_sinfo->beacon_loss_count = link->u.mgd.beacon_loss_count; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } link_sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, link_id)); if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { link_sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link_sinfo->tx_bytes += link_sta_info->tx_stats.bytes[ac]; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { link_sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link_sinfo->tx_packets += link_sta_info->tx_stats.packets[ac]; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { link_sinfo->rx_bytes += sta_get_stats_bytes(&link_sta_info->rx_stats); if (link_sta_info->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); link_sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { link_sinfo->rx_packets = link_sta_info->rx_stats.packets; if (link_sta_info->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); link_sinfo->rx_packets += cpurxs->packets; } } link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { link_sinfo->tx_retries = link_sta_info->status_stats.retry_count; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { link_sinfo->tx_failed = link_sta_info->status_stats.retry_failed; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link_sinfo->rx_duration += sta->airtime[ac].rx_airtime; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link_sinfo->tx_duration += sta->airtime[ac].tx_airtime; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { link_sinfo->airtime_weight = sta->airtime_weight; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } link_sinfo->rx_dropped_misc = link_sta_info->rx_stats.dropped; if (link_sta_info->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); link_sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); link_sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif, -1); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { link_sinfo->signal = (s8)last_rxstats->last_signal; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!link_sta_info->pcpu_rx_stats && !(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { link_sinfo->signal_avg = -ewma_signal_read(&link_sta_info->rx_stats_avg.signal); link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!link_sta_info->pcpu_rx_stats) link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); link_sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(link_sinfo->chain_signal); i++) { link_sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; link_sinfo->chain_signal_avg[i] = -ewma_signal_read( &link_sta_info->rx_stats_avg.chain_signal[i]); } } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && ieee80211_rate_valid(&link_sta_info->tx_stats.last_rate)) { sta_set_rate_info_tx(sta, &link_sta_info->tx_stats.last_rate, &link_sinfo->txrate); link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) { if (sta_set_rate_info_rx(sta, &link_sinfo->rxrate, link_id) == 0) link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_link_sinfo_alloc_tid_stats(link_sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &link_sinfo->pertid[i], i, link_id); } link_sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; link_sinfo->bss_param.dtim_period = link->conf->dtim_period; link_sinfo->bss_param.beacon_interval = link->conf->beacon_int; thr = sta_get_expected_throughput(sta); if (thr != 0) { link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); link_sinfo->expected_throughput = thr; } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && link_sta_info->status_stats.ack_signal_filled) { link_sinfo->ack_signal = link_sta_info->status_stats.last_ack_signal; link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && link_sta_info->status_stats.ack_signal_filled) { link_sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &link_sta_info->status_stats.avg_ack_signal); link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } } void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta, -1); sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a * chance to e.g. just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) sinfo->rx_beacon = sdata->deflink.u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->deflink.u.mgd.beacon_loss_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, -1)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->deflink.tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->deflink.rx_stats.packets; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_packets += cpurxs->packets; } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->deflink.status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->deflink.status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->rx_duration += sta->airtime[ac].rx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_duration += sta->airtime[ac].tx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { sinfo->airtime_weight = sta->airtime_weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif, -1); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->deflink.pcpu_rx_stats && !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->deflink.rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->deflink.pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && !sta->sta.valid_links && ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) { sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate, -1) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &sinfo->pertid[i], i, -1); } #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) sta_set_mesh_sinfo(sta, sinfo); #endif sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); if (test_sta_flag(sta, WLAN_STA_ASSOC)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->deflink.status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } if (sta->sta.valid_links) { struct ieee80211_link_data *link; struct link_sta_info *link_sta; int link_id; ether_addr_copy(sinfo->mld_addr, sta->addr); /* assign valid links first for iteration */ sinfo->valid_links = sta->sta.valid_links; for_each_valid_link(sinfo, link_id) { link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); link = wiphy_dereference(sdata->local->hw.wiphy, sdata->link[link_id]); if (!link_sta || !sinfo->links[link_id] || !link) { sinfo->valid_links &= ~BIT(link_id); continue; } sta_set_link_sinfo(sta, sinfo->links[link_id], link, tidstats); } } } u32 sta_get_expected_throughput(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; /* check if the driver has a SW RC implementation */ if (ref && ref->ops->get_expected_throughput) thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); else thr = drv_get_expected_throughput(local, sta); return thr; } unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id) { struct ieee80211_sta_rx_stats *stats; struct link_sta_info *link_sta_info; stats = sta_get_last_rx_stats(sta, link_id); if (link_id < 0) link_sta_info = &sta->deflink; else link_sta_info = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); if (!link_sta_info->status_stats.last_ack || time_after(stats->last_rx, link_sta_info->status_stats.last_ack)) return stats->last_rx; return link_sta_info->status_stats.last_ack; } int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct sta_link_alloc *alloc; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); /* must represent an MLD from the start */ if (WARN_ON(!sta->sta.valid_links)) return -EINVAL; if (WARN_ON(sta->sta.valid_links & BIT(link_id) || sta->link[link_id])) return -EBUSY; alloc = kzalloc(sizeof(*alloc), GFP_KERNEL); if (!alloc) return -ENOMEM; ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL); if (ret) { kfree(alloc); return ret; } sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta); ieee80211_link_sta_debugfs_add(&alloc->info); return 0; } void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id) { lockdep_assert_wiphy(sta->sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); sta_remove_link(sta, link_id, false); } int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct link_sta_info *link_sta; u16 old_links = sta->sta.valid_links; u16 new_links = old_links | BIT(link_id); int ret; link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&sdata->local->hw.wiphy->mtx)); if (WARN_ON(old_links == new_links || !link_sta)) return -EINVAL; rcu_read_lock(); if (link_sta_info_hash_lookup(sdata->local, link_sta->addr)) { rcu_read_unlock(); return -EALREADY; } /* we only modify under the mutex so this is fine */ rcu_read_unlock(); sta->sta.valid_links = new_links; if (WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) goto hash; ieee80211_recalc_min_chandef(sdata, link_id); /* Ensure the values are updated for the driver, * redone by sta_remove_link on failure. */ ieee80211_sta_recalc_aggregates(&sta->sta); ret = drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, new_links); if (ret) { sta->sta.valid_links = old_links; sta_remove_link(sta, link_id, false); return ret; } hash: ret = link_sta_info_hash_add(sdata->local, link_sta); WARN_ON(ret); return 0; } void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; u16 old_links = sta->sta.valid_links; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta->sta.valid_links &= ~BIT(link_id); if (!WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, sta->sta.valid_links); sta_remove_link(sta, link_id, true); } void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len) { u8 val; sta->sta.max_amsdu_subframes = 0; if (ext_capab_len < 8) return; /* The sender might not have sent the last bit, consider it to be 0 */ val = u8_get_bits(ext_capab[7], WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB); /* we did get all the bits, take the MSB as well */ if (ext_capab_len >= 9) val |= u8_get_bits(ext_capab[8], WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB) << 1; if (val) sta->sta.max_amsdu_subframes = 4 << (4 - val); } #ifdef CONFIG_LOCKDEP bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); return lockdep_is_held(&sta->local->hw.wiphy->mtx); } EXPORT_SYMBOL(lockdep_sta_mutex_held); #endif
431 435 372 65 93 18 76 77 62 36 109 129 7 10 111 12 12 105 211 6 204 240 4 238 3 1 132 37 135 3 133 38 1 37 9 101 101 5 90 21 89 22 101 5 96 6 49 3 3 29 7 6 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 // SPDX-License-Identifier: GPL-2.0-or-later /* * Symmetric key cipher operations. * * Generic encrypt/decrypt wrapper for ciphers, handles operations across * multiple page boundaries by using temporary blocks. In user context, * the kernel is given a chance to schedule us once per page. * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/bug.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_choices.h> #include <net/netlink.h> #include "skcipher.h" #define CRYPTO_ALG_TYPE_SKCIPHER_MASK 0x0000000e static const struct crypto_type crypto_skcipher_type; static inline struct skcipher_alg *__crypto_skcipher_alg( struct crypto_alg *alg) { return container_of(alg, struct skcipher_alg, base); } int skcipher_walk_virt(struct skcipher_walk *__restrict walk, struct skcipher_request *__restrict req, bool atomic) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg; might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP); alg = crypto_skcipher_alg(tfm); walk->total = req->cryptlen; walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)) atomic = true; if (unlikely(!walk->total)) return 0; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); walk->blocksize = crypto_skcipher_blocksize(tfm); walk->ivsize = crypto_skcipher_ivsize(tfm); walk->alignmask = crypto_skcipher_alignmask(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) walk->stride = alg->co.chunksize; else walk->stride = alg->walksize; return skcipher_walk_first(walk, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_virt); static int skcipher_walk_aead_common(struct skcipher_walk *__restrict walk, struct aead_request *__restrict req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)) atomic = true; if (unlikely(!walk->total)) return 0; scatterwalk_start_at_pos(&walk->in, req->src, req->assoclen); scatterwalk_start_at_pos(&walk->out, req->dst, req->assoclen); walk->blocksize = crypto_aead_blocksize(tfm); walk->stride = crypto_aead_chunksize(tfm); walk->ivsize = crypto_aead_ivsize(tfm); walk->alignmask = crypto_aead_alignmask(tfm); return skcipher_walk_first(walk, atomic); } int skcipher_walk_aead_encrypt(struct skcipher_walk *__restrict walk, struct aead_request *__restrict req, bool atomic) { walk->total = req->cryptlen; return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_encrypt); int skcipher_walk_aead_decrypt(struct skcipher_walk *__restrict walk, struct aead_request *__restrict req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); walk->total = req->cryptlen - crypto_aead_authsize(tfm); return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_decrypt); static void skcipher_set_needkey(struct crypto_skcipher *tfm) { if (crypto_skcipher_max_keysize(tfm) != 0) crypto_skcipher_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_skcipher_alignmask(tfm); struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); u8 *buffer, *alignbuffer; unsigned long absize; int ret; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cipher->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); unsigned long alignmask = crypto_skcipher_alignmask(tfm); int err; if (cipher->co.base.cra_type != &crypto_skcipher_type) { struct crypto_lskcipher **ctx = crypto_skcipher_ctx(tfm); crypto_lskcipher_clear_flags(*ctx, CRYPTO_TFM_REQ_MASK); crypto_lskcipher_set_flags(*ctx, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_lskcipher_setkey(*ctx, key, keylen); goto out; } if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) err = skcipher_setkey_unaligned(tfm, key, keylen); else err = cipher->setkey(tfm, key, keylen); out: if (unlikely(err)) { skcipher_set_needkey(tfm); return err; } crypto_skcipher_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_skcipher_setkey); int crypto_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_encrypt_sg(req); return alg->encrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt); int crypto_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_decrypt_sg(req); return alg->decrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt); static int crypto_lskcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(out, ivs + crypto_skcipher_ivsize(tfm), crypto_skcipher_statesize(tfm)); return 0; } static int crypto_lskcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(ivs + crypto_skcipher_ivsize(tfm), in, crypto_skcipher_statesize(tfm)); return 0; } static int skcipher_noexport(struct skcipher_request *req, void *out) { return 0; } static int skcipher_noimport(struct skcipher_request *req, const void *in) { return 0; } int crypto_skcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_export(req, out); return alg->export(req, out); } EXPORT_SYMBOL_GPL(crypto_skcipher_export); int crypto_skcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_import(req, in); return alg->import(req, in); } EXPORT_SYMBOL_GPL(crypto_skcipher_import); static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); alg->exit(skcipher); } static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); skcipher_set_needkey(skcipher); if (tfm->__crt_alg->cra_type != &crypto_skcipher_type) { unsigned am = crypto_skcipher_alignmask(skcipher); unsigned reqsize; reqsize = am & ~(crypto_tfm_ctx_alignment() - 1); reqsize += crypto_skcipher_ivsize(skcipher); reqsize += crypto_skcipher_statesize(skcipher); crypto_skcipher_set_reqsize(skcipher, reqsize); return crypto_init_lskcipher_ops_sg(tfm); } crypto_skcipher_set_reqsize(skcipher, crypto_tfm_alg_reqsize(tfm)); if (alg->exit) skcipher->base.exit = crypto_skcipher_exit_tfm; if (alg->init) return alg->init(skcipher); return 0; } static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg) { if (alg->cra_type != &crypto_skcipher_type) return sizeof(struct crypto_lskcipher *); return crypto_alg_extsize(alg); } static void crypto_skcipher_free_instance(struct crypto_instance *inst) { struct skcipher_instance *skcipher = container_of(inst, struct skcipher_instance, s.base); skcipher->free(skcipher); } static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); seq_printf(m, "type : skcipher\n"); seq_printf(m, "async : %s\n", str_yes_no(alg->cra_flags & CRYPTO_ALG_ASYNC)); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", skcipher->min_keysize); seq_printf(m, "max keysize : %u\n", skcipher->max_keysize); seq_printf(m, "ivsize : %u\n", skcipher->ivsize); seq_printf(m, "chunksize : %u\n", skcipher->chunksize); seq_printf(m, "walksize : %u\n", skcipher->walksize); seq_printf(m, "statesize : %u\n", skcipher->statesize); } static int __maybe_unused crypto_skcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); struct crypto_report_blkcipher rblkcipher; memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = skcipher->min_keysize; rblkcipher.max_keysize = skcipher->max_keysize; rblkcipher.ivsize = skcipher->ivsize; return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, sizeof(rblkcipher), &rblkcipher); } static const struct crypto_type crypto_skcipher_type = { .extsize = crypto_skcipher_extsize, .init_tfm = crypto_skcipher_init_tfm, .free = crypto_skcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_skcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_skcipher_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK, .type = CRYPTO_ALG_TYPE_SKCIPHER, .tfmsize = offsetof(struct crypto_skcipher, base), .algsize = offsetof(struct skcipher_alg, base), }; int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_skcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_skcipher); struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher( const char *alg_name, u32 type, u32 mask) { struct crypto_skcipher *tfm; /* Only sync algorithms allowed. */ mask |= CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE; type &= ~(CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE); tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); /* * Make sure we do not allocate something that might get used with * an on-stack request: check the request size. */ if (!IS_ERR(tfm) && WARN_ON(crypto_skcipher_reqsize(tfm) > MAX_SYNC_SKCIPHER_REQSIZE)) { crypto_free_skcipher(tfm); return ERR_PTR(-EINVAL); } return (struct crypto_sync_skcipher *)tfm; } EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher); int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_skcipher); int skcipher_prepare_alg_common(struct skcipher_alg_common *alg) { struct crypto_alg *base = &alg->base; if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 || alg->statesize > PAGE_SIZE / 2 || (alg->ivsize + alg->statesize) > PAGE_SIZE / 2) return -EINVAL; if (!alg->chunksize) alg->chunksize = base->cra_blocksize; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; return 0; } static int skcipher_prepare_alg(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg_common(&alg->co); if (err) return err; if (alg->walksize > PAGE_SIZE / 8) return -EINVAL; if (!alg->walksize) alg->walksize = alg->chunksize; if (!alg->statesize) { alg->import = skcipher_noimport; alg->export = skcipher_noexport; } else if (!(alg->import && alg->export)) return -EINVAL; base->cra_type = &crypto_skcipher_type; base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER; return 0; } int crypto_register_skcipher(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_skcipher); void crypto_unregister_skcipher(struct skcipher_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_skcipher); int crypto_register_skciphers(struct skcipher_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_skcipher(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_skciphers); void crypto_unregister_skciphers(struct skcipher_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_skciphers); int skcipher_register_instance(struct crypto_template *tmpl, struct skcipher_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = skcipher_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, skcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(skcipher_register_instance); static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(cipher, key, keylen); } static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst); struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); struct crypto_cipher *cipher; cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->cipher = cipher; return 0; } static void skcipher_exit_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); crypto_free_cipher(ctx->cipher); } static void skcipher_free_instance_simple(struct skcipher_instance *inst) { crypto_drop_cipher(skcipher_instance_ctx(inst)); kfree(inst); } /** * skcipher_alloc_instance_simple - allocate instance of simple block cipher mode * * Allocate an skcipher_instance for a simple block cipher mode of operation, * e.g. cbc or ecb. The instance context will have just a single crypto_spawn, * that for the underlying cipher. The {min,max}_keysize, ivsize, blocksize, * alignmask, and priority are set from the underlying cipher but can be * overridden if needed. The tfm context defaults to skcipher_ctx_simple, and * default ->setkey(), ->init(), and ->exit() methods are installed. * * @tmpl: the template being instantiated * @tb: the template parameters * * Return: a pointer to the new instance, or an ERR_PTR(). The caller still * needs to register the instance. */ struct skcipher_instance *skcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct skcipher_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *cipher_alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = skcipher_instance_ctx(inst); err = crypto_grab_cipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; cipher_alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(skcipher_crypto_instance(inst), tmpl->name, cipher_alg); if (err) goto err_free_inst; inst->free = skcipher_free_instance_simple; /* Default algorithm properties, can be overridden */ inst->alg.base.cra_blocksize = cipher_alg->cra_blocksize; inst->alg.base.cra_alignmask = cipher_alg->cra_alignmask; inst->alg.base.cra_priority = cipher_alg->cra_priority; inst->alg.min_keysize = cipher_alg->cra_cipher.cia_min_keysize; inst->alg.max_keysize = cipher_alg->cra_cipher.cia_max_keysize; inst->alg.ivsize = cipher_alg->cra_blocksize; /* Use skcipher_ctx_simple by default, can be overridden */ inst->alg.base.cra_ctxsize = sizeof(struct skcipher_ctx_simple); inst->alg.setkey = skcipher_setkey_simple; inst->alg.init = skcipher_init_tfm_simple; inst->alg.exit = skcipher_exit_tfm_simple; return inst; err_free_inst: skcipher_free_instance_simple(inst); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Symmetric key cipher type"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 // SPDX-License-Identifier: GPL-2.0 /* * Interface between ext4 and JBD */ #include "ext4_jbd2.h" #include <trace/events/ext4.h> int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC) && !mapping_large_folio_support(inode->i_mapping))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ BUG(); } /* Just increment the non-pointer handle value */ static handle_t *ext4_get_nojournal(void) { handle_t *handle = current->journal_info; unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); ref_cnt++; handle = (handle_t *)ref_cnt; current->journal_info = handle; return handle; } /* Decrement the non-pointer handle value */ static void ext4_put_nojournal(handle_t *handle) { unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt == 0); ref_cnt--; handle = (handle_t *)ref_cnt; current->journal_info = handle; } /* * Wrappers for jbd2_journal_start/end. */ static int ext4_journal_check_start(struct super_block *sb) { int ret; journal_t *journal; might_sleep(); ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* * Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); return -EROFS; } return 0; } handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; if (inode) trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); else trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); journal = EXT4_SB(sb)->s_journal; if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return ext4_get_nojournal(); return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; int rc; if (!ext4_handle_valid(handle)) { ext4_put_nojournal(handle); return 0; } err = handle->h_err; if (!handle->h_transaction) { rc = jbd2_journal_stop(handle); return err ? err : rc; } sb = handle->h_transaction->t_journal->j_private; rc = jbd2_journal_stop(handle); if (!err) err = rc; if (err) __ext4_std_error(sb, where, line, err); return err; } handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type) { struct super_block *sb; int err; if (!ext4_handle_valid(handle)) return ext4_get_nojournal(); sb = handle->h_journal->j_private; trace_ext4_journal_start_reserved(sb, jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); return ERR_PTR(err); } err = jbd2_journal_start_reserved(handle, type, line); if (err < 0) return ERR_PTR(err); return handle; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred) { if (!ext4_handle_valid(handle)) return 0; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0; extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); return ext4_journal_extend(handle, extend_cred, revoke_cred); } static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); BUG_ON(!ext4_handle_valid(handle)); if (bh) BUFFER_TRACE(bh, "abort"); if (!handle->h_err) handle->h_err = err; if (is_handle_aborted(handle)) return; printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } static void ext4_check_bdev_write_error(struct super_block *sb) { struct address_space *mapping = sb->s_bdev->bd_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; /* * If the block device has write error flag, it may have failed to * async write out metadata buffers in the background. In this case, * we could read old data from disk and write it out again, which * may lead to on-disk filesystem inconsistency. */ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { spin_lock(&sbi->s_bdev_wb_lock); err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); spin_unlock(&sbi->s_bdev_wb_lock); if (err) ext4_error_err(sb, -err, "Error while async write back metadata"); } } int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; might_sleep(); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } } else ext4_check_bdev_write_error(sb); if (trigger_type == EXT4_JTR_NONE || !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be * revoked in all cases. * * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. */ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; might_sleep(); trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); /* * In the no journal case, we should wait for the ongoing buffer * to complete and do a forget. */ if (!ext4_handle_valid(handle)) { if (bh) { clear_buffer_dirty(bh); wait_on_buffer(bh); __bforget(bh); } return 0; } /* Never use the revoke function if we are doing full data * journaling: there is no need to, and a V1 superblock won't * support it. Otherwise, only skip the revoke on un-journaled * data blocks. */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (!is_metadata && !ext4_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } return 0; } /* * data!=journal && (is_metadata || should_journal_data(inode)) */ BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); __ext4_error(inode->i_sb, where, line, true, -err, 0, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; if (!ext4_handle_valid(handle)) return 0; err = jbd2_journal_get_create_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } if (trigger_type == EXT4_JTR_NONE || !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh) { int err = 0; might_sleep(); set_buffer_meta(bh); set_buffer_prio(bh); set_buffer_uptodate(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); /* Errors can only happen due to aborted journal or a nasty bug */ if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { pr_err("EXT4: jbd2_journal_dirty_metadata " "failed: handle type %u started at " "line %u, credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " "handle type %u started at line %u, " "credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); } } else { if (inode) mark_buffer_dirty_inode(bh, inode); else mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { ext4_error_inode_err(inode, where, line, bh->b_blocknr, EIO, "IO error syncing itable block"); err = -EIO; } } } return err; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _linux_POSIX_TIMERS_H #define _linux_POSIX_TIMERS_H #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/pid.h> #include <linux/posix-timers_types.h> #include <linux/rcuref.h> #include <linux/spinlock.h> #include <linux/timerqueue.h> struct kernel_siginfo; struct task_struct; struct sigqueue; struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) { return ((~pid) << 3) | clock; } static inline clockid_t make_thread_cpuclock(const unsigned int tid, const clockid_t clock) { return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK); } static inline clockid_t fd_to_clockid(const int fd) { return make_process_cpuclock((unsigned int) fd, CLOCKFD); } static inline int clockid_to_fd(const clockid_t clk) { return ~(clk >> 3); } static inline bool clockid_aux_valid(clockid_t id) { return IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) && id >= CLOCK_AUX && id <= CLOCK_AUX_LAST; } #ifdef CONFIG_POSIX_TIMERS #include <linux/signal_types.h> /** * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig * @head: timerqueue head on which this timer is queued * @pid: Pointer to target task PID * @elist: List head for the expiry list * @firing: Timer is currently firing * @nanosleep: Timer is used for nanosleep and is not a regular posix-timer * @handling: Pointer to the task which handles expiry */ struct cpu_timer { struct timerqueue_node node; struct timerqueue_head *head; struct pid *pid; struct list_head elist; bool firing; bool nanosleep; struct task_struct __rcu *handling; }; static inline bool cpu_timer_enqueue(struct timerqueue_head *head, struct cpu_timer *ctmr) { ctmr->head = head; return timerqueue_add(head, &ctmr->node); } static inline bool cpu_timer_queued(struct cpu_timer *ctmr) { return !!ctmr->head; } static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { if (cpu_timer_queued(ctmr)) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; return true; } return false; } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) { return ctmr->node.expires; } static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) { ctmr->node.expires = exp; } static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); pct->bases[0].nextevt = U64_MAX; pct->bases[1].nextevt = U64_MAX; pct->bases[2].nextevt = U64_MAX; } void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, u64 runtime) { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); long posixtimer_create_prctl(unsigned long ctrl); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ .nextevt = U64_MAX, \ } #define INIT_CPU_TIMERBASES(b) { \ INIT_CPU_TIMERBASE(b[0]), \ INIT_CPU_TIMERBASE(b[1]), \ INIT_CPU_TIMERBASE(b[2]), \ } #define INIT_CPU_TIMERS(s) \ .posix_cputimers = { \ .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else struct cpu_timer { }; #define INIT_CPU_TIMERS(s) static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK void clear_posix_cputimers_work(struct task_struct *p); void posix_cputimers_init_work(void); #else static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List node for binding the timer to tsk::signal::posix_timers * @ignored_list: List node for tracking ignored timers in tsk::signal::ignored_posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_status: The status of the timer * @it_sig_periodic: The periodic status at signal delivery * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery * @it_sigqueue_seq: The sequence count at the point where the signal was queued * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @rcuref: Reference count for life time management * @sigq: Embedded sigqueue * @it: Union representing the various posix timer type * internals. * @rcu: RCU head for freeing the timer. */ struct k_itimer { /* 1st cacheline contains read-mostly fields */ struct hlist_node t_hash; struct hlist_node list; timer_t it_id; clockid_t it_clock; int it_sigev_notify; enum pid_type it_pid_type; struct signal_struct *it_signal; const struct k_clock *kclock; /* 2nd cacheline and above contain fields which are modified regularly */ spinlock_t it_lock; int it_status; bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; unsigned int it_sigqueue_seq; ktime_t it_interval; struct hlist_node ignored_list; union { struct pid *it_pid; struct task_struct *it_process; }; struct sigqueue sigq; rcuref_t rcuref; union { struct { struct hrtimer timer; } real; struct cpu_timer cpu; struct { struct alarm alarmtimer; } alarm; } it; struct rcu_head rcu; } ____cacheline_aligned_in_smp; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, u64 *newval, u64 *oldval); int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); #ifdef CONFIG_POSIX_TIMERS static inline void posixtimer_putref(struct k_itimer *tmr) { if (rcuref_put(&tmr->rcuref)) posixtimer_free_timer(tmr); } static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); WARN_ON_ONCE(!rcuref_get(&tmr->rcuref)); } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); posixtimer_putref(tmr); } static inline bool posixtimer_valid(const struct k_itimer *timer) { unsigned long val = (unsigned long)timer->it_signal; return !(val & 0x1UL); } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } #endif /* !CONFIG_POSIX_TIMERS */ #endif
90 44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 // SPDX-License-Identifier: GPL-2.0-or-later /* Sysfs attributes of bond slaves * * Copyright (c) 2014 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/bonding.h> struct slave_attribute { struct attribute attr; ssize_t (*show)(struct slave *, char *); }; #define SLAVE_ATTR_RO(_name) \ const struct slave_attribute slave_attr_##_name = __ATTR_RO(_name) static ssize_t state_show(struct slave *slave, char *buf) { switch (bond_slave_state(slave)) { case BOND_STATE_ACTIVE: return sysfs_emit(buf, "active\n"); case BOND_STATE_BACKUP: return sysfs_emit(buf, "backup\n"); default: return sysfs_emit(buf, "UNKNOWN\n"); } } static SLAVE_ATTR_RO(state); static ssize_t mii_status_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%s\n", bond_slave_link_status(slave->link)); } static SLAVE_ATTR_RO(mii_status); static ssize_t link_failure_count_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", slave->link_failure_count); } static SLAVE_ATTR_RO(link_failure_count); static ssize_t perm_hwaddr_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%*phC\n", slave->dev->addr_len, slave->perm_hwaddr); } static SLAVE_ATTR_RO(perm_hwaddr); static ssize_t queue_id_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(slave->queue_id)); } static SLAVE_ATTR_RO(queue_id); static ssize_t ad_aggregator_id_show(struct slave *slave, char *buf) { const struct aggregator *agg; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg) return sysfs_emit(buf, "%d\n", agg->aggregator_identifier); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_aggregator_id); static ssize_t ad_actor_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->actor_oper_port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_actor_oper_port_state); static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->partner_oper.port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_partner_oper_port_state); static const struct attribute *slave_attrs[] = { &slave_attr_state.attr, &slave_attr_mii_status.attr, &slave_attr_link_failure_count.attr, &slave_attr_perm_hwaddr.attr, &slave_attr_queue_id.attr, &slave_attr_ad_aggregator_id.attr, &slave_attr_ad_actor_oper_port_state.attr, &slave_attr_ad_partner_oper_port_state.attr, NULL }; #define to_slave_attr(_at) container_of(_at, struct slave_attribute, attr) static ssize_t slave_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct slave_attribute *slave_attr = to_slave_attr(attr); struct slave *slave = to_slave(kobj); return slave_attr->show(slave, buf); } const struct sysfs_ops slave_sysfs_ops = { .show = slave_show, }; int bond_sysfs_slave_add(struct slave *slave) { return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { sysfs_remove_files(&slave->kobj, slave_attrs); }
2 25 333 12 43 542 542 198 198 349 347 56 56 13 42 55 724 19 208 208 208 218 218 6 218 15 56 569 348 606 606 542 542 542 542 542 540 541 12 541 542 12 87 84 10 10 105 103 2 105 74 74 74 74 80 8 8 8 87 79 4 4 6 1 82 1 1 1 1 1 1 87 25 75 75 73 2 73 2 73 1 75 75 65 65 44 25 45 25 46 25 45 25 65 65 65 65 65 45 25 37 3 34 16 16 19 19 396 560 370 198 3 348 349 349 542 542 541 542 541 542 12 542 541 12 734 736 736 608 608 245 3 138 170 242 3 2 542 542 542 12 542 363 187 353 352 353 353 15 2 2 5 2 1 2 2 518 514 5 518 167 372 467 467 467 466 36 542 234 234 237 1 8 9 9 422 218 208 421 422 22 396 5 422 422 421 192 192 192 192 159 185 169 188 185 185 188 192 62 56 9 50 2 2 9 50 20 48 50 2 7 36 3 62 194 4 192 6 6 70 29 2 7 62 1 62 6 47 69 1 70 65 2 5 41 46 3 14 2 12 14 14 14 4 11 14 11 57 203 203 203 203 203 5 5 5 5 5 177 3 3 3 24 18 18 4 1 8 288 422 19 3 16 19 7 218 218 218 218 218 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 /* * Resizable virtual memory filesystem for Linux. * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. * Copyright (C) 2002-2011 Hugh Dickins. * Copyright (C) 2011 Google Inc. * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * * Extended attribute support for tmpfs: * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * * tiny-shmem: * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> * * This file is released under the GPL. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fileattr.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/shmem_fs.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "swap.h" static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem. */ #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> #include <linux/rmap.h> #include <linux/uuid.h> #include <linux/quotaops.h> #include <linux/rcupdate_wait.h> #include <linux/uaccess.h> #include "internal.h" #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 /* Pretend that one inode + its dentry occupy this much memory */ #define BOGO_INODE_SIZE 1024 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 /* * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { unsigned long long blocks; unsigned long long inodes; struct mempolicy *mpol; kuid_t uid; kgid_t gid; umode_t mode; bool full_inums; int huge; int seen; bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *encoding; bool strict_encoding; #endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 #define SHMEM_SEEN_NOSWAP 16 #define SHMEM_SEEN_QUOTA 32 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long huge_shmem_orders_always __read_mostly; static unsigned long huge_shmem_orders_madvise __read_mostly; static unsigned long huge_shmem_orders_inherit __read_mostly; static unsigned long huge_shmem_orders_within_size __read_mostly; static bool shmem_orders_configured __initdata; #endif #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); return min3(nr_pages - totalhigh_pages(), nr_pages / 2, ULONG_MAX / BOGO_INODE_SIZE); } #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->s_fs_info; } /* * shmem_file_setup pre-accounts the whole fixed size of a VM object, * for shared memory and for shared anonymous (/dev/zero) mappings * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), * consistent with the pre-accounting of private mappings ... */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & VM_NORESERVE) ? 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { if (!(flags & VM_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { if (!(flags & VM_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); } return 0; } /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & VM_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { if (flags & VM_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } static int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (!percpu_counter_limited_add(&sbinfo->used_blocks, sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); if (err) { percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) goto unacct; } return 0; unacct: shmem_unacct_blocks(info->flags, pages); return err; } static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); might_sleep(); /* when quotas */ dquot_free_block_nodirty(inode, pages); if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); } static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA static int shmem_enable_quotas(struct super_block *sb, unsigned short quota_types) { int type, err = 0; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < SHMEM_MAXQUOTAS; type++) { if (!(quota_types & (1 << type))) continue; err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); if (err) goto out_err; } return 0; out_err: pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); return err; } static void shmem_disable_quotas(struct super_block *sb) { int type; for (type = 0; type < SHMEM_MAXQUOTAS; type++) dquot_quota_off(sb, type); } static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } #endif /* CONFIG_TMPFS_QUOTA */ /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop. */ #define SHMEM_INO_BATCH 1024 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums && ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility */ if (IS_ENABLED(CONFIG_64BIT)) pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", __func__, MINOR(sb->s_dev)); sbinfo->next_ino = 1; ino = sbinfo->next_ino++; } *inop = ino; } raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility. */ ino_t *next_ino; next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } *inop = ino; *next_ino = ++ino; put_cpu(); } return 0; } static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * @alloced: the change in number of pages allocated to inode * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * * Return: true if swapped was incremented from 0, for shmem_writeout(). */ static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); bool first_swapped = false; long freed; spin_lock(&info->lock); info->alloced += alloced; info->swapped += swapped; freed = info->alloced - info->swapped - READ_ONCE(inode->i_mapping->nrpages); /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ if (swapped > 0) { if (info->swapped == swapped) first_swapped = true; freed += swapped; } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ xa_lock_irq(&mapping->i_pages); mapping->nrpages += pages; xa_unlock_irq(&mapping->i_pages); shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ shmem_recalc_inode(inode, 0, 0); } /* * Replace item expected in xarray by a new item, while holding xa_lock. */ static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); item = xas_load(&xas); if (item != expected) return -ENOENT; xas_store(&xas, replacement); return 0; } /* * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. * Returns the swap entry's order if it still presents, else returns -1. */ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { XA_STATE(xas, &mapping->i_pages, index); int ret = -1; void *entry; rcu_read_lock(); do { entry = xas_load(&xas); if (entry == swp_to_radix_entry(swap)) ret = xas_get_order(&xas); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return ret; } /* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 #define SHMEM_HUGE_ALWAYS 1 #define SHMEM_HUGE_WITHIN_SIZE 2 #define SHMEM_HUGE_ADVISE 3 /* * Special values. * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: * * SHMEM_HUGE_DENY: * disables huge on shm_mnt and all mounts, for emergency use; * SHMEM_HUGE_FORCE: * enables huge on shm_mnt and all mounts, w/o needing option, for testing; * */ #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) { pgoff_t aligned_index; unsigned long order; loff_t i_size; order = highest_order(within_size_orders); while (within_size_orders) { aligned_index = round_up(index + 1, 1 << order); i_size = max(write_end, i_size_read(inode)); i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= aligned_index) return within_size_orders; order = next_order(&within_size_orders, order); } return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); unsigned long within_size_orders; if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order; /* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs mmap()'s huge order, we still use PMD-sized order to * allocate huge pages due to lack of a write size hint. * * For tmpfs with 'huge=always' or 'huge=within_size' mount option, * we will always try PMD-sized order first. If that failed, it will * fall back to small large folios. */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: if (vma) return maybe_pmd_order; return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: if (vma) within_size_orders = maybe_pmd_order; else within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT; within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) return maybe_pmd_order; fallthrough; default: return 0; } } static int shmem_parse_huge(const char *str) { int huge; if (!str) return -EINVAL; if (!strcmp(str, "never")) huge = SHMEM_HUGE_NEVER; else if (!strcmp(str, "always")) huge = SHMEM_HUGE_ALWAYS; else if (!strcmp(str, "within_size")) huge = SHMEM_HUGE_WITHIN_SIZE; else if (!strcmp(str, "advise")) huge = SHMEM_HUGE_ADVISE; else if (!strcmp(str, "deny")) huge = SHMEM_HUGE_DENY; else if (!strcmp(str, "force")) huge = SHMEM_HUGE_FORCE; else return -EINVAL; if (!has_transparent_hugepage() && huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; /* Do not override huge allocation policy with non-PMD sized mTHP */ if (huge == SHMEM_HUGE_FORCE && huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) return -EINVAL; return huge; } #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { case SHMEM_HUGE_NEVER: return "never"; case SHMEM_HUGE_ALWAYS: return "always"; case SHMEM_HUGE_WITHIN_SIZE: return "within_size"; case SHMEM_HUGE_ADVISE: return "advise"; case SHMEM_HUGE_DENY: return "deny"; case SHMEM_HUGE_FORCE: return "force"; default: VM_BUG_ON(1); return "bad_val"; } } #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; spin_lock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &sbinfo->shrinklist) { info = list_entry(pos, struct shmem_inode_info, shrinklist); /* pin the inode */ inode = igrab(&info->vfs_inode); /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); goto next; } list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; if (!--batch) break; } spin_unlock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &list) { pgoff_t next, end; loff_t i_size; int ret; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; if (nr_to_free && freed >= nr_to_free) goto move_back; i_size = i_size_read(inode); folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); if (!folio || xa_is_value(folio)) goto drop; /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } /* Check if there is anything to gain from splitting */ next = folio_next_index(folio); end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) { folio_put(folio); goto drop; } /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!folio_trylock(folio)) { folio_put(folio); goto move_back; } ret = split_folio(folio); folio_unlock(folio); folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back; freed += next - end; split++; drop: list_del_init(&info->shrinklist); goto put; move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted). */ spin_lock(&sbinfo->shrinklist_lock); list_move(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; spin_unlock(&sbinfo->shrinklist_lock); put: iput(inode); } return split; } static long shmem_unused_huge_scan(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (!READ_ONCE(sbinfo->shrinklist_len)) return SHRINK_STOP; return shmem_unused_huge_shrink(sbinfo, sc, 0); } static long shmem_unused_huge_count(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); unsigned long nr = folio_nr_pages(folio); swp_entry_t iter, swap; void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); swap = radix_to_swp_entry(expected); do { iter = swap; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes. */ if (!expected || entry != swp_to_radix_entry(iter)) { xas_set_err(&xas, -EEXIST); goto unlock; } iter.val += 1 << xas_get_order(&xas); } if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { folio->mapping = NULL; folio_ref_sub(folio, nr); return xas_error(&xas); } return 0; } /* * Somewhat like filemap_remove_folio, but substitutes swap for @folio. */ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error; xa_lock_irq(&mapping->i_pages); error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); } /* * Remove swap entry from page cache, free the swap and its page cache. Returns * the number of pages being freed. 0 means entry not found in XArray (0 pages * being freed). */ static long shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) return 0; free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); return 1 << order; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return swapped << PAGE_SHIFT; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsigned long swapped; /* Be careful as we don't hold info->lock */ swapped = READ_ONCE(info->swapped); /* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track. */ if (!swapped) return 0; if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff, vma->vm_pgoff + vma_pages(vma)); } /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) { struct folio_batch fbatch; pgoff_t index = 0; folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping) && filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { check_move_unevictable_folios(&fbatch); folio_batch_release(&fbatch); cond_resched(); } } static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; /* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes. */ folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) { folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */ folio_unlock(folio); folio_put(folio); } /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ folio = NULL; shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; if (lend == -1) end = -1; /* unsigned, so actually very big */ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio. */ if (unfalloc) goto whole_folios; same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { same_folio = lend < folio_pos(folio) + folio_size(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) { folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } whole_folios: index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { long swaps_freed; if (unfalloc) continue; swaps_freed = shmem_free_swap(mapping, indices[i], folio); if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } nr_swaps_freed += swaps_freed; continue; } folio_lock(folio); if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (!folio_test_large(folio)) { truncate_inode_folio(mapping, folio); } else if (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is. */ if (!folio_test_large(folio)) { folio_unlock(folio); index = start; break; } } } folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); } shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) shmem_recalc_inode(inode, 0, 0); if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (info->fsflags & FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = info->i_crtime.tv_sec; stat->btime.tv_nsec = info->i_crtime.tv_nsec; } return 0; } static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; bool update_mtime = false; bool update_ctime = true; error = setattr_prepare(idmap, dentry, attr); if (error) return error; if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { if ((inode->i_mode ^ attr->ia_mode) & 0111) { return -EPERM; } } if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; if (newsize != oldsize) { error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) return error; i_size_write(inode, newsize); update_mtime = true; } else { update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); if (info->alloced) shmem_truncate_range(inode, newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } } if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } /* Transfer quota accounting */ if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); if (error) return error; } setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode_set_ctime_current(inode); if (update_mtime) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) { list_del_init(&info->shrinklist); sbinfo->shrinklist_len--; } spin_unlock(&sbinfo->shrinklist_lock); } while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); spin_unlock(&shmem_swaplist_lock); } } simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); #ifdef CONFIG_TMPFS_QUOTA dquot_free_inode(inode); dquot_drop(inode); #endif } static unsigned int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, struct folio_batch *fbatch, pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; swp_entry_t entry; rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do. */ if (swp_type(entry) != type) continue; indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return folio_batch_count(fbatch); } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. */ static int shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); ret++; } if (error == -ENOMEM) break; error = 0; } return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; int ret = 0; do { folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch, indices, type)) { ret = 0; break; } ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; } /* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused. */ int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; if (list_empty(&shmem_swaplist)) return 0; spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over; next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); } spin_unlock(&shmem_swaplist_lock); return error; } /** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache. */ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; int nr_pages; bool split = false; if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { index = shmem_fallocend(inode, DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) || !IS_ENABLED(CONFIG_THP_SWAP)) split = true; } if (split) { try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; folio_clear_dirty(folio); } index = folio->index; nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many. */ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty; } folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist. */ if (first_swapped) { spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); spin_unlock(&shmem_swaplist_lock); } swap_shmem_alloc(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error; } /* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added. */ error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(folio->swap), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); swap_free_nr(folio->swap, nr_pages); } /* * The swap_cache_del_folio() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer. */ swap_cache_del_folio(folio); goto redirty; } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx); static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); return folio; } /* * Make sure huge_gfp is always more limited than limit_gfp. * Some of the flags set permissions, while others set limitations. */ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); /* Allow allocations only from the originally specified zones. */ result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, * and the intersection of the allow flags. */ result |= (limit_gfp & denyflags); result |= (huge_gfp & limit_gfp) & allowflags; return result; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { if (shmem_huge == SHMEM_HUGE_DENY) return false; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && shmem_huge != SHMEM_HUGE_NEVER) return true; return false; } unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, shmem_huge_force, vma, vm_flags); /* Tmpfs huge pages allocation */ if (!vma || !vma_is_anon_shmem(vma)) return global_orders; /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. */ if (shmem_huge == SHMEM_HUGE_DENY) return 0; /* * Only allow inherit orders if the top-level value is 'force', which * means non-PMD sized THP can not override 'huge' mount option now. */ if (shmem_huge == SHMEM_HUGE_FORCE) return READ_ONCE(huge_shmem_orders_inherit); /* Allow mTHP that will be fully within i_size. */ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; } static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; if (vma) { orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) return 0; } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated * and added to page cache by a racing thread, or that there * is already at least one small page in the huge extent. * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here. */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; order = next_order(&orders, order); } return orders; } #else static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, order, &ilx); folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); mpol_cond_put(mpol); return folio; } static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, gfp_t gfp, struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, unsigned long orders) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; long pages; int error, order; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) orders = 0; if (orders > 0) { suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; index = round_down(index, pages); folio = shmem_alloc_folio(gfp, order, info, index); if (folio) goto allocated; if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); order = next_order(&suitable_orders, order); } } else { pages = 1; folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); gfp &= GFP_RECLAIM_MASK; error = mem_cgroup_charge(folio, fault_mm, gfp); if (error) { if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; } else if (pages > 1) { if (pages == HPAGE_PMD_NR) { count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); } goto unlock; } error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); if (error) goto unlock; error = shmem_inode_acct_blocks(inode, pages); if (error) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); long freed; /* * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. */ spin_lock(&info->lock); freed = pages + info->alloced - info->swapped - READ_ONCE(mapping->nrpages); if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); if (freed > 0) shmem_inode_unacct_blocks(inode, freed); error = shmem_inode_acct_blocks(inode, pages); if (error) { filemap_remove_folio(folio); goto unlock; } } shmem_recalc_inode(inode, pages, 0); folio_add_lru(folio); return folio; unlock: folio_unlock(folio); folio_put(folio); return ERR_PTR(error); } static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); int nr_pages = 1 << order; struct folio *new; gfp_t alloc_gfp; void *shadow; /* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); } else if (order) { /* * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. * Any existing sub folio in the swap cache also blocks * mTHP swapin. */ if ((vma && unlikely(userfaultfd_armed(vma))) || !zswap_never_enabled() || non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback; alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); if (!new) { new = ERR_PTR(-ENOMEM); goto fallback; } if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, alloc_gfp, entry)) { folio_put(new); new = ERR_PTR(-ENOMEM); goto fallback; } /* * Prevent parallel swapin from proceeding with the swap cache flag. * * Of course there is another possible concurrent scenario as well, * that is to say, the swap cache flag of a large folio has already * been set by swapcache_prepare(), while another thread may have * already split the large swap entry stored in the shmem mapping. * In this case, shmem_add_to_page_cache() will help identify the * concurrent swapin and return -EEXIST. */ if (swapcache_prepare(entry, nr_pages)) { folio_put(new); new = ERR_PTR(-EEXIST); /* Try smaller folio to avoid cache conflict */ goto fallback; } __folio_set_locked(new); __folio_set_swapbacked(new); new->swap = entry; memcg1_swapin(entry, nr_pages); shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(new, shadow); folio_add_lru(new); swap_read_folio(new, NULL); return new; fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) return new; entry.val += index - round_down(index, nr_pages); alloc_gfp = gfp; nr_pages = 1; order = 0; goto retry; } /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), * we may need to copy to a suitable page before moving to filecache. * * In a future release, this may well be extended to respect cpuset and * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. */ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) { return folio_zonenum(folio) > gfp_zone(gfp); } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; int nr_pages = folio_nr_pages(old); int error = 0; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages > 1) { gfp_t huge_gfp = vma_thp_gfp_mask(vma); gfp = limit_gfp_mask(huge_gfp, gfp); } #endif new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); __folio_set_locked(new); __folio_set_swapbacked(new); folio_mark_uptodate(new); new->swap = entry; folio_set_swapcache(new); ci = swap_cluster_get_and_lock_irq(old); __swap_cache_replace_folio(ci, old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); swap_cluster_unlock_irq(ci); folio_add_lru(new); *foliop = new; folio_clear_swapcache(old); old->private = NULL; folio_unlock(old); /* * The old folio are removed from swap cache, drop the 'nr_pages' * reference, as well as one temporary reference getting from swap * cache. */ folio_put_refs(old, nr_pages + 1); return error; } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct folio *folio, swp_entry_t swap, bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); if (old != swp_to_radix_entry(swap)) return; nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); if (!skip_swapcache) swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ shmem_recalc_inode(inode, -nr_pages, -nr_pages); swap_free_nr(swap, nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, swp_entry_t swap, gfp_t gfp) { struct address_space *mapping = inode->i_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); int split_order = 0; int i; /* Convert user data gfp flags to xarray node gfp flags */ gfp &= GFP_RECLAIM_MASK; for (;;) { void *old = NULL; int cur_order; pgoff_t swap_index; xas_lock_irq(&xas); old = xas_load(&xas); if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { xas_set_err(&xas, -EEXIST); goto unlock; } cur_order = xas_get_order(&xas); if (!cur_order) goto unlock; /* Try to split large swap entry in pagecache */ swap_index = round_down(index, 1 << cur_order); split_order = xas_try_split_min_order(cur_order); while (cur_order > 0) { pgoff_t aligned_index = round_down(index, 1 << cur_order); pgoff_t swap_offset = aligned_index - swap_index; xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, cur_order); if (xas_error(&xas)) goto unlock; /* * Re-set the swap entry after splitting, and the swap * offset of the original large entry must be continuous. */ for (i = 0; i < 1 << cur_order; i += (1 << split_order)) { swp_entry_t tmp; tmp = swp_entry(swp_type(swap), swp_offset(swap) + swap_offset + i); __xa_store(&mapping->i_pages, aligned_index + i, swp_to_radix_entry(tmp), 0); } cur_order = split_order; split_order = xas_try_split_min_order(split_order); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) return xas_error(&xas); return 0; } /* * Swap in the folio pointed to by *foliop. * Caller has to make sure that *foliop contains a valid swapped folio. * Returns 0 and the folio in foliop if success. On failure, returns the * error code and NULL in *foliop. */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swap, index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; int error, nr_pages, order; pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); index_entry = radix_to_swp_entry(*foliop); swap = index_entry; *foliop = NULL; if (is_poisoned_swp_entry(index_entry)) return -EIO; si = get_swap_device(index_entry); order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; else return -EINVAL; } if (unlikely(order < 0)) { put_swap_device(si); return -EEXIST; } /* index may point to the middle of a large entry, get the sub entry */ if (order) { offset = index - round_down(index, 1 << order); swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, index_entry, order, gfp); if (IS_ERR(folio)) { error = PTR_ERR(folio); folio = NULL; goto failed; } skip_swapcache = true; } else { /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; } } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } } else { swap_update_readahead(folio, NULL, 0); } if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: * It may fallback to order 0 due to memory pressure or race, * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ error = shmem_split_large_entry(inode, index, index_entry, gfp); if (error) goto failed_nolock; } /* * If the folio is large, round down swap and index by folio size. * No matter what race occurs, the swap layer ensures we either get * a valid folio that has its swap entry aligned by size, or a * temporarily invalid one which we'll abort very soon and retry. * * shmem_add_to_page_cache ensures the whole range contains expected * entries and prevents any corruption, so any race split is fine * too, it will succeed as long as the entries are still there. */ nr_pages = folio_nr_pages(folio); if (nr_pages > 1) { swap.val = round_down(swap.val, nr_pages); index = round_down(index, nr_pages); } /* * We have to do this with the folio locked to prevent races. * The shmem_confirm_swap below only checks if the first swap * entry matches the folio, that's enough to ensure the folio * is not used outside of shmem, as shmem swap entries * and swap cache folios are never partially freed. */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || shmem_confirm_swap(mapping, index, swap) < 0 || folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } if (!folio_test_uptodate(folio)) { error = -EIO; goto failed; } folio_wait_writeback(folio); /* * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (skip_swapcache) { folio->swap.val = 0; swapcache_clear(si, swap, nr_pages); } else { swap_cache_del_folio(folio); } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; return 0; failed: if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap, skip_swapcache); unlock: if (folio) folio_unlock(folio); failed_nolock: if (skip_swapcache) swapcache_clear(si, folio->swap, folio_nr_pages(folio)); if (folio) folio_put(folio); put_swap_device(si); return error; } /* * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; bool alloced; unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; alloced = false; fault_mm = vma ? vma->vm_mm : NULL; folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); return 0; } if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; *foliop = folio; return error; } if (folio) { folio_lock(folio); /* Has the folio been truncated or swapped out? */ if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); folio_put(folio); } /* * SGP_READ: succeed on hole, with NULL folio, letting caller zero. * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) return -ENOENT; /* * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); return 0; } /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); folio = shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) goto repeat; } folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) goto repeat; folio = NULL; goto unlock; } alloced: alloced = true; if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to * ->shrink_list in shmem_unused_huge_shrink() */ if (list_empty_careful(&info->shrinklist)) { list_add_tail(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; } spin_unlock(&sbinfo->shrinklist_lock); } if (sgp == SGP_WRITE) folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* * Let SGP_WRITE caller clear ends if write does not fill folio; * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { long i, n = folio_nr_pages(folio); for (i = 0; i < n; i++) clear_highpage(folio_page(folio, i)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; } out: *foliop = folio; return 0; /* * Error recovery. */ unlock: if (alloced) filemap_remove_folio(folio); shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } return error; } /** * shmem_get_folio - find, and lock a shmem folio. * @inode: inode to search * @index: the page index. * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * * Looks up the page cache entry at @inode & @index. If a folio is * present, it is returned locked with an increased refcount. * * If the caller modifies data in the folio, it must call folio_mark_dirty() * before unlocking the folio to ensure that the folio is not reclaimed. * There is no need to reserve space before calling folio_mark_dirty(). * * When no folio is found, the behavior depends on @sgp: * - for SGP_READ, *@foliop is %NULL and 0 is returned * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. * * Context: May sleep. * Return: 0 if successful, else a negative error code. */ int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the * target. */ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, * and the i_mmap tree grows ever slower to scan if new vmas are added. * * It does not matter if we sometimes reach this check just before the * hole-punch begins, so that one fault then races with the punch: * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. */ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) { struct shmem_falloc *shmem_falloc; struct file *fpin = NULL; vm_fault_t ret = 0; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; fpin = maybe_unlock_mmap_for_io(vmf, NULL); shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); schedule(); /* * shmem_falloc_waitq points into the shmem_fallocate() * stack of the hole-punching task: shmem_falloc_waitq * is usually invalid by the time we reach here, but * finish_wait() does not dereference it in that case; * though i_lock needed lest racing with wake_up_all(). */ spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); } spin_unlock(&inode->i_lock); if (fpin) { fput(fpin); ret = VM_FAULT_RETRY; } return ret; } static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; vm_fault_t ret = 0; int err; /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { ret = shmem_falloc_wait(vmf, inode); if (ret) return ret; } WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); ret |= VM_FAULT_LOCKED; } return ret; } unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; if (addr & ~PAGE_MASK) return addr; if (addr > TASK_SIZE - len) return addr; if (shmem_huge == SHMEM_HUGE_DENY) return addr; if (flags & MAP_FIXED) return addr; /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. * But if caller specified an address hint and we allocated area there * successfully, respect that as before. */ if (uaddr == addr) return addr; hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; unsigned long __maybe_unused hpage_orders; int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); sb = file_inode(file)->i_sb; } else { /* * Called directly from mm/mmap.c, or drivers/char/mem.c * for "/dev/zero", to create a shared anonymous object. */ if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; /* * Find the highest mTHP order used for anonymous shmem to * provide a suitable alignment address. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE hpage_orders = READ_ONCE(huge_shmem_orders_always); hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); if (hpage_orders > 0) { order = highest_order(hpage_orders); hpage_size = PAGE_SIZE << order; } #endif } if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } if (len < hpage_size) return addr; offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); if (offset && offset + len < 2 * hpage_size) return addr; if ((addr & (hpage_size - 1)) == offset) return addr; inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; return inflated_addr; } #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; /* * Bias interleave by inode number to distribute better across nodes; * but this interface is independent of which page order is used, so * supplies only that bias, letting caller apply the offset (adjusted * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). */ *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { struct mempolicy *mpol; /* Bias interleave by inode number to distribute better across nodes */ *ilx = info->vfs_inode.i_ino + (index >> order); mpol = mpol_shared_policy_lookup(&info->policy, index); return mpol ? mpol : get_task_policy(current); } #else static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { *ilx = 0; return NULL; } #endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; /* * What serializes the accesses to info->flags? * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). */ if (lock && !(info->flags & VM_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && ucounts) { user_shm_unlock(inode->i_size, ucounts); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; out_nomem: return retval; } static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) vma->vm_ops = &shmem_vm_ops; else vma->vm_ops = &shmem_anon_vm_ops; return 0; } static int shmem_file_open(struct inode *inode, struct file *file) { file->f_mode |= FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); #if IS_ENABLED(CONFIG_UNICODE) /* * shmem_inode_casefold_flags - Deal with casefold file attribute flag * * The casefold file attribute needs some special checks. I can just be added to * an empty dir, and can't be removed from a non-empty dir. */ static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { unsigned int old = inode->i_flags; struct super_block *sb = inode->i_sb; if (fsflags & FS_CASEFOLD_FL) { if (!(old & S_CASEFOLD)) { if (!sb->s_encoding) return -EOPNOTSUPP; if (!S_ISDIR(inode->i_mode)) return -ENOTDIR; if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } *i_flags = *i_flags | S_CASEFOLD; } else if (old & S_CASEFOLD) { if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } return 0; } #else static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { if (fsflags & FS_CASEFOLD_FL) return -EOPNOTSUPP; return 0; } #endif /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. */ static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; int ret; ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); if (ret) return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; if (fsflags & FS_APPEND_FL) i_flags |= S_APPEND; if (fsflags & FS_IMMUTABLE_FL) i_flags |= S_IMMUTABLE; /* * But FS_NODUMP_FL does not require any action in i_flags. */ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); return 0; } #else static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL #endif static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) { return &SHMEM_I(inode)->dir_offsets; } static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; int err; err = shmem_reserve_inode(sb, &ino); if (err) return ERR_PTR(err); inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); return ERR_PTR(-ENOSPC); } inode->i_ino = ino; inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; simple_inode_init_ts(inode); inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); /* Don't consider 'deny' for emergencies and 'force' for testing */ if (sbinfo->huge) mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; inode->i_fop = &simple_offset_dir_operations; simple_offset_init(shmem_get_offset_ctx(inode)); break; case S_IFLNK: /* * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. */ mpol_shared_policy_init(&info->policy, NULL); break; } lockdep_annotate_inode_mutex_key(inode); return inode; } #ifdef CONFIG_TMPFS_QUOTA static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { int err; struct inode *inode; inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); if (IS_ERR(inode)) return inode; err = dquot_initialize(inode); if (err) goto errout; err = dquot_alloc_inode(inode); if (err) { dquot_drop(inode); goto errout; } return inode; errout: inode->i_flags |= S_NOQUOTA; iput(inode); return ERR_PTR(err); } #else static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); } #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; struct folio *folio; int ret; pgoff_t max_off; if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to * avoid a BUG_ON in our caller. */ if (unlikely(*foliop)) { folio_put(*foliop); *foliop = NULL; } return -ENOMEM; } if (!*foliop) { ret = -ENOMEM; folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { page_kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. Despite the * mmap_lock being read recursive a deadlock is still * possible if a writer has taken a lock. For example: * * process A thread 1 takes read lock on own mmap_lock * process A thread 2 calls mmap, blocks taking write lock * process B thread 1 takes page fault, read lock on own mmap lock * process B thread 2 calls mmap, blocks taking write lock * process A thread 1 blocks taking read lock on process B * process B thread 1 blocks taking read lock on process A * * Disable page faults to prevent potential deadlock * and retry the copy outside the mmap_lock. */ pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { *foliop = folio; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; } flush_dcache_folio(folio); } else { /* ZEROPAGE */ clear_user_highpage(&folio->page, dst_addr); } } else { folio = *foliop; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *foliop = NULL; } VM_BUG_ON(folio_test_locked(folio)); VM_BUG_ON(folio_test_swapbacked(folio)); __folio_set_locked(folio); __folio_set_swapbacked(folio); __folio_mark_uptodate(folio); ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release; ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); if (ret) goto out_release; ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, flags); if (ret) goto out_delete_from_cache; shmem_recalc_inode(inode, 1, 0); folio_unlock(folio); return 0; out_delete_from_cache: filemap_remove_folio(folio); out_release: folio_unlock(folio); folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; } #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; } ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); return -EIO; } *foliop = folio; return 0; } static int shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); if (!folio_test_uptodate(folio)) { if (copied < folio_size(folio)) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + copied, folio_size(folio)); } folio_mark_uptodate(folio); } folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; int error = 0; ssize_t retval = 0; for (;;) { struct folio *folio = NULL; struct page *page = NULL; unsigned long nr, ret; loff_t end_offset, i_size = i_size_read(inode); bool fallback_page_copy = false; size_t fsize; if (unlikely(iocb->ki_pos >= i_size)) break; index = iocb->ki_pos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ i_size = i_size_read(inode); if (unlikely(iocb->ki_pos >= i_size)) { if (folio) folio_put(folio); break; } end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); if (folio && likely(!fallback_page_copy)) fsize = folio_size(folio); else fsize = PAGE_SIZE; offset = iocb->ki_pos & (fsize - 1); nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_copy)) flush_dcache_folio(folio); else flush_dcache_page(page); } /* * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ if (likely(!fallback_page_copy)) ret = copy_folio_to_iter(folio, offset, nr, to); else ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but * clear_user() not so much, that it is noticeably * faster to copy the zero page instead of clearing. */ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { /* * But submitting the same page twice in a row to * splice() - or others? - can result in confusion: * so don't attempt that optimization on pipes etc. */ ret = iov_iter_zero(nr, to); } retval += ret; iocb->ki_pos += ret; if (!iov_iter_count(to)) break; if (ret < nr) { error = -EFAULT; break; } cond_resched(); } file_accessed(file); return retval ? retval : error; } static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto unlock; ret = file_remove_privs(file); if (ret) goto unlock; ret = file_update_time(file); if (ret) goto unlock; ret = generic_perform_write(iocb, from); unlock: inode_unlock(inode); return ret; } static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return true; } static void zero_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { } static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return false; } static const struct pipe_buf_operations zero_pipe_buf_ops = { .release = zero_pipe_buf_release, .try_steal = zero_pipe_buf_try_steal, .get = zero_pipe_buf_get, }; static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, loff_t fpos, size_t size) { size_t offset = fpos & ~PAGE_MASK; size = min_t(size_t, size, PAGE_SIZE - offset); if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { .ops = &zero_pipe_buf_ops, .page = ZERO_PAGE(0), .offset = offset, .len = size, }; pipe->head++; } return size; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); struct address_space *mapping = inode->i_mapping; struct folio *folio = NULL; size_t total_spliced = 0, used, npages, n, part; loff_t isize; int error = 0; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); do { bool fallback_page_splice = false; struct page *page = NULL; pgoff_t index; size_t size; if (*ppos >= i_size_read(inode)) break; index = *ppos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_splice = true; } /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; /* * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned * pages. */ size = len; if (unlikely(fallback_page_splice)) { size_t offset = *ppos & ~PAGE_MASK; size = umin(size, PAGE_SIZE - offset); } part = min_t(loff_t, isize - *ppos, size); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_splice)) flush_dcache_folio(folio); else flush_dcache_page(page); } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can * now splice it into the pipe. */ n = splice_folio_into_pipe(pipe, folio, *ppos, part); folio_put(folio); folio = NULL; } else { n = splice_zeropage_into_pipe(pipe, *ppos, part); } if (!n) break; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) break; cond_resched(); } while (len); if (folio) folio_put(folio); file_accessed(in); return total_spliced ? total_spliced : error; } static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); if (offset < 0) return -ENXIO; inode_lock(inode); /* We're holding i_rwsem so we can access i_size directly */ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); return offset; } static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } start = offset >> PAGE_SHIFT; end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; goto out; } shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; shmem_falloc.nr_unswapped = 0; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); /* * info->fallocend is only relevant when huge pages might be * involved: to prevent split_huge_page() freeing fallocated * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. */ undo_fallocend = info->fallocend; if (info->fallocend < end) info->fallocend = end; for (index = start; index < end; ) { struct folio *folio; /* * Check for fatal signal so that we abort early in OOM * situations. We don't want to abort in case of non-fatal * signals as large fallocate can take noticeable time and * e.g. periodic timers may result in fallocate constantly * restarting. */ if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else error = shmem_get_folio(inode, index, offset + len, &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, ((loff_t)index << PAGE_SHIFT) - 1, true); } goto undone; } /* * Here is a more important optimization than it appears: * a second SGP_FALLOC on the same large folio will clear it, * making it uptodate and un-undoable if we fail later. */ index = folio_next_index(folio); /* Beware 32-bit wraparound */ if (!index) index--; /* * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. * But mark it dirty so that memory pressure will swap rather * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); cond_resched(); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: if (!error) file_modified(file); inode_unlock(inode); return error; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; buf->f_bavail = buf->f_bfree = sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } /* * File creation. Allocate an inode, and we're done.. */ static int shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); error = simple_acl_create(dir, inode); if (error) goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ return error; out_iput: iput(inode); return error; } static int shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_acl_create(dir, inode); if (error) goto out_iput; d_tmpfile(file, inode); err_out: return finish_open_simple(file, error); out_iput: iput(inode); return error; } static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) return ERR_PTR(error); inc_nlink(dir); return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret = 0; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. * But if an O_TMPFILE file is linked into the tmpfs, the * first link must skip that, to get the accounting right. */ if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) goto out; } ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) shmem_free_inode(inode->i_sb, 0); goto out; } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); out: return ret; } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb, 0); simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - does all the work */ /* * For now, VFS can't deal with case-insensitive negative dentries, so * we invalidate them */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); return 0; } static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); drop_nlink(dir); return shmem_unlink(dir, dentry); } static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; int error; whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); if (!whiteout) return -ENOMEM; error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); if (error) return error; /* * Cheat and hash the whiteout while the old dentry is still in * place, instead of playing games with FS_RENAME_DOES_D_MOVE. * * d_lookup() will consistently find one of them at this point, * not sure which one, but that isn't even important. */ d_rehash(whiteout); return 0; } /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; } static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct folio *folio; char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { link = kmemdup(symname, len, GFP_KERNEL); if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); dget(dentry); return 0; out_remove_offset: simple_offset_remove(shmem_get_offset_ctx(dir), dentry); out_iput: iput(inode); return error; } static void shmem_put_link(void *arg) { folio_mark_accessed(arg); folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; int error; if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); } folio_unlock(folio); } set_delayed_call(done, shmem_put_link, folio); return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); return 0; } static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); ret = shmem_set_inode_flags(inode, flags, dentry); if (ret) return ret; info->fsflags = flags; inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ /* * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; size_t ispace = 0; size_t len; if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, xattr->value_len + XATTR_SECURITY_PREFIX_LEN); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } } for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { kvfree(new_xattr); break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); simple_xattr_add(&info->xattrs, new_xattr); } if (xattr->name != NULL) { if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } simple_xattrs_free(&info->xattrs, NULL); return -ENOMEM; } return 0; } static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); simple_xattr_free(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, &shmem_user_xattr_handler, NULL }; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static struct dentry *shmem_get_parent(struct dentry *child) { return ERR_PTR(-ESTALE); } static int shmem_match(struct inode *ino, void *vfh) { __u32 *fh = vfh; __u64 inum = fh[2]; inum = (inum << 32) | fh[1]; return ino->i_ino == inum && fh[0] == ino->i_generation; } /* Find any alias of inode, but prefer a hashed alias */ static struct dentry *shmem_find_alias(struct inode *inode) { struct dentry *alias = d_find_alias(inode); return alias ?: d_find_any_alias(inode); } static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode; struct dentry *dentry = NULL; u64 inum; if (fh_len < 3) return NULL; inum = fid->raw[2]; inum = (inum << 32) | fid->raw[1]; inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { dentry = shmem_find_alias(inode); iput(inode); } return dentry; } static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, struct inode *parent) { if (*len < 3) { *len = 3; return FILEID_INVALID; } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); } fh[0] = inode->i_generation; fh[1] = inode->i_ino; fh[2] = ((__u64)inode->i_ino) >> 32; *len = 3; return 1; } static const struct export_operations shmem_export_ops = { .get_parent = shmem_get_parent, .encode_fh = shmem_encode_fh, .fh_to_dentry = shmem_fh_to_dentry, }; enum shmem_param { Opt_gid, Opt_huge, Opt_mode, Opt_mpol, Opt_nr_blocks, Opt_nr_inodes, Opt_size, Opt_uid, Opt_inode32, Opt_inode64, Opt_noswap, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_usrquota_block_hardlimit, Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, Opt_casefold_version, Opt_casefold, Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { {"never", SHMEM_HUGE_NEVER }, {"always", SHMEM_HUGE_ALWAYS }, {"within_size", SHMEM_HUGE_WITHIN_SIZE }, {"advise", SHMEM_HUGE_ADVISE }, {} }; const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), #ifdef CONFIG_TMPFS_QUOTA fsparam_flag ("quota", Opt_quota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif fsparam_string("casefold", Opt_casefold_version), fsparam_flag ("casefold", Opt_casefold), fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; #if IS_ENABLED(CONFIG_UNICODE) static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { struct shmem_options *ctx = fc->fs_private; int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; if (!latest_version) { if (strncmp(param->string, "utf8-", 5)) return invalfc(fc, "Only UTF-8 encodings are supported " "in the format: utf8-<version number>"); version = utf8_parse_version(version_str); if (version < 0) return invalfc(fc, "Invalid UTF-8 version: %s", version_str); } encoding = utf8_load(version); if (IS_ERR(encoding)) { return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); } pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); ctx->encoding = encoding; return 0; } #else static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); } #endif static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; struct fs_parse_result result; unsigned long long size; char *rest; int opt; kuid_t kuid; kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_size: size = memparse(param->string, &rest); if (*rest == '%') { size <<= PAGE_SHIFT; size *= totalram_pages(); do_div(size, 100); rest++; } if (*rest) goto bad_value; ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; case Opt_mode: ctx->mode = result.uint_32 & 07777; break; case Opt_uid: kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; ctx->uid = kuid; break; case Opt_gid: kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; break; case Opt_mpol: if (IS_ENABLED(CONFIG_NUMA)) { mpol_put(ctx->mpol); ctx->mpol = NULL; if (mpol_parse_str(param->string, &ctx->mpol)) goto bad_value; break; } goto unsupported_parameter; case Opt_inode32: ctx->full_inums = false; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_inode64: if (sizeof(ino_t) < 8) { return invalfc(fc, "Cannot use inode64 with <64bit inums in kernel\n"); } ctx->full_inums = true; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_noswap: if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { return invalfc(fc, "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; ctx->seen |= SHMEM_SEEN_NOSWAP; break; case Opt_quota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); break; case Opt_usrquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_USR; break; case Opt_grpquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_GRP; break; case Opt_usrquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "User quota block hardlimit too large."); ctx->qlimits.usrquota_bhardlimit = size; break; case Opt_grpquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "Group quota block hardlimit too large."); ctx->qlimits.grpquota_bhardlimit = size; break; case Opt_usrquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "User quota inode hardlimit too large."); ctx->qlimits.usrquota_ihardlimit = size; break; case Opt_grpquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; case Opt_casefold_version: return shmem_parse_opt_casefold(fc, param, false); case Opt_casefold: return shmem_parse_opt_casefold(fc, param, true); case Opt_strict_encoding: #if IS_ENABLED(CONFIG_UNICODE) ctx->strict_encoding = true; break; #else return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); #endif } return 0; unsupported_parameter: return invalfc(fc, "Unsupported parameter '%s'", param->key); bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } static char *shmem_next_opt(char **s) { char *sbegin = *s; char *p; if (sbegin == NULL) return NULL; /* * NUL-terminate this option: unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas. */ for (;;) { p = strchr(*s, ','); if (p == NULL) break; *s = p + 1; if (!isdigit(*(p+1))) { *p = '\0'; return sbegin; } } *s = NULL; return sbegin; } static int shmem_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* * Reconfigure a shmem filesystem. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; goto out; } if (percpu_counter_compare(&sbinfo->used_blocks, ctx->blocks) > 0) { err = "Too small a size for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { if (!sbinfo->max_inodes) { err = "Cannot retroactively limit inodes"; goto out; } if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && sbinfo->next_ino > UINT_MAX) { err = "Current inum too high to switch to 32-bit inums"; goto out; } if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { err = "Cannot enable swap on remount if it was disabled on first mount"; goto out; } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { err = "Cannot enable quota on remount"; goto out; } #ifdef CONFIG_TMPFS_QUOTA #define CHANGED_LIMIT(name) \ (ctx->qlimits.name## hardlimit && \ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { err = "Cannot change global quota limit on remount"; goto out; } #endif /* CONFIG_TMPFS_QUOTA */ if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) sbinfo->full_inums = ctx->full_inums; if (ctx->seen & SHMEM_SEEN_BLOCKS) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } if (ctx->noswap) sbinfo->noswap = true; raw_spin_unlock(&sbinfo->stat_lock); mpol_put(mpol); return 0; out: raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); /* * Showing inode{64,32} might be useful even if it's the system default, * since then people don't have to resort to checking both here and * /proc/config.gz to confirm 64-bit inums were successfully applied * (which may not even exist if IKCONFIG_PROC isn't enabled). * * We hide it when inode64 isn't the default and we are using 32-bit * inodes, since that probably just means the feature isn't even under * consideration. * * As such: * * +-----------------+-----------------+ * | TMPFS_INODE64=y | TMPFS_INODE64=n | * +------------------+-----------------+-----------------+ * | full_inums=true | show | show | * | full_inums=false | show | hide | * +------------------+-----------------+-----------------+ * */ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif mpol = shmem_get_sbmpol(sbinfo); shmem_show_mpol(seq, mpol); mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); #ifdef CONFIG_TMPFS_QUOTA if (sb_has_quota_active(root->d_sb, USRQUOTA)) seq_printf(seq, ",usrquota"); if (sb_has_quota_active(root->d_sb, GRPQUOTA)) seq_printf(seq, ",grpquota"); if (sbinfo->qlimits.usrquota_bhardlimit) seq_printf(seq, ",usrquota_block_hardlimit=%lld", sbinfo->qlimits.usrquota_bhardlimit); if (sbinfo->qlimits.grpquota_bhardlimit) seq_printf(seq, ",grpquota_block_hardlimit=%lld", sbinfo->qlimits.grpquota_bhardlimit); if (sbinfo->qlimits.usrquota_ihardlimit) seq_printf(seq, ",usrquota_inode_hardlimit=%lld", sbinfo->qlimits.usrquota_ihardlimit); if (sbinfo->qlimits.grpquota_ihardlimit) seq_printf(seq, ",grpquota_inode_hardlimit=%lld", sbinfo->qlimits.grpquota_ihardlimit); #endif return 0; } #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return error; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. */ if (!(sb->s_flags & SB_KERNMOUNT)) { if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES)) ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS)) ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); sbinfo->noswap = ctx->noswap; } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); error = -EINVAL; goto failed; } if (ctx->encoding) { sb->s_encoding = ctx->encoding; set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } #endif #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) goto failed; } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; else sbinfo->huge = tmpfs_huge; #endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_XATTR sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { sb->dq_op = &shmem_quota_operations; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; /* Copy the default limits from ctx into sbinfo */ memcpy(&sbinfo->qlimits, &ctx->qlimits, sizeof(struct shmem_quota_limits)); if (shmem_enable_quotas(sb, ctx->quota_types)) goto failed; } #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); if (!sb->s_root) goto failed; return 0; failed: shmem_put_super(sb); return error; } static int shmem_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, shmem_fill_super); } static void shmem_free_fc(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; if (ctx) { mpol_put(ctx->mpol); kfree(ctx); } } static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void shmem_free_in_core_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); if (S_ISDIR(inode->i_mode)) simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) { struct shmem_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ static int shmem_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif .error_remove_folio = shmem_error_remove_folio, }; static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif }; static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, .unlink = shmem_unlink, .symlink = shmem_symlink, .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct inode_operations shmem_special_inode_operations = { .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .show_options = shmem_show_options, #endif #ifdef CONFIG_TMPFS_QUOTA .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif }; static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->mode = 0777 | S_ISVTX; ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); #if IS_ENABLED(CONFIG_UNICODE) ctx->encoding = NULL; #endif fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; #ifdef CONFIG_TMPFS fc->sb_flags |= SB_I_VERSION; #endif return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .init_fs_context = shmem_init_fs_context, #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define TMPFS_ATTR_W(_name, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define TMPFS_ATTR_RW(_name, _show, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define TMPFS_ATTR_RO(_name, _show) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #if IS_ENABLED(CONFIG_UNICODE) static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "supported\n"); } TMPFS_ATTR_RO(casefold, casefold_show); #endif static struct attribute *tmpfs_attributes[] = { #if IS_ENABLED(CONFIG_UNICODE) &tmpfs_attr_casefold.attr, #endif NULL }; static const struct attribute_group tmpfs_attribute_group = { .attrs = tmpfs_attributes, .name = "features" }; static struct kobject *tmpfs_kobj; static int __init tmpfs_sysfs_init(void) { int ret; tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); if (!tmpfs_kobj) return -ENOMEM; ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); if (ret) kobject_put(tmpfs_kobj); return ret; } #endif /* CONFIG_SYSFS && CONFIG_TMPFS */ void __init shmem_init(void) { int error; shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); pr_err("Could not kern_mount tmpfs\n"); goto out1; } #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) error = tmpfs_sysfs_init(); if (error) { pr_err("Could not init tmpfs sysfs\n"); goto out1; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs. */ if (!shmem_orders_configured) huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; out1: unregister_filesystem(&shmem_fs_type); out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, SHMEM_HUGE_NEVER, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; int len = 0; int i; for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, shmem_huge == values[i] ? "%s[%s]" : "%s%s", i ? " " : "", shmem_format_huge(values[i])); } len += sysfs_emit_at(buf, len, "\n"); return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; memcpy(tmp, buf, count); tmp[count] = '\0'; if (count && tmp[count - 1] == '\n') tmp[count - 1] = '\0'; huge = shmem_parse_huge(tmp); if (huge == -EINVAL) return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; err = start_stop_khugepaged(); return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; if (test_bit(order, &huge_shmem_orders_always)) output = "[always] inherit within_size advise never"; else if (test_bit(order, &huge_shmem_orders_inherit)) output = "always [inherit] within_size advise never"; else if (test_bit(order, &huge_shmem_orders_within_size)) output = "always inherit [within_size] advise never"; else if (test_bit(order, &huge_shmem_orders_madvise)) output = "always inherit within_size [advise] never"; else output = "always inherit within_size advise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; if (sysfs_streq(buf, "always")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_always); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ if (shmem_huge == SHMEM_HUGE_FORCE && order != HPAGE_PMD_ORDER) return -EINVAL; spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_inherit); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "within_size")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "never")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); clear_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else { ret = -EINVAL; } if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } struct kobj_attribute thpsize_shmem_enabled_attr = __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __init setup_transparent_hugepage_shmem(char *str) { int huge; huge = shmem_parse_huge(str); if (huge == -EINVAL) { pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); return huge; } shmem_huge = huge; return 1; } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); static int __init setup_transparent_hugepage_tmpfs(char *str) { int huge; huge = shmem_parse_huge(str); if (huge < 0) { pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); return huge; } tmpfs_huge = huge; return 1; } __setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { char *token, *range, *policy, *subtoken; unsigned long always, inherit, madvise, within_size; char *start_size, *end_size; int start, end, nr; char *p; if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; strscpy(str_dup, str); always = huge_shmem_orders_always; inherit = huge_shmem_orders_inherit; madvise = huge_shmem_orders_madvise; within_size = huge_shmem_orders_within_size; p = str_dup; while ((token = strsep(&p, ";")) != NULL) { range = strsep(&token, ":"); policy = token; if (!policy) goto err; while ((subtoken = strsep(&range, ",")) != NULL) { if (strchr(subtoken, '-')) { start_size = strsep(&subtoken, "-"); end_size = subtoken; start = get_order_from_str(start_size, THP_ORDERS_ALL_FILE_DEFAULT); end = get_order_from_str(end_size, THP_ORDERS_ALL_FILE_DEFAULT); } else { start_size = end_size = subtoken; start = end = get_order_from_str(subtoken, THP_ORDERS_ALL_FILE_DEFAULT); } if (start < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", start_size); goto err; } if (end < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", end_size); goto err; } if (start > end) goto err; nr = end - start + 1; if (!strcmp(policy, "always")) { bitmap_set(&always, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "advise")) { bitmap_set(&madvise, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "inherit")) { bitmap_set(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "within_size")) { bitmap_set(&within_size, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "never")) { bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else { pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); goto err; } } } huge_shmem_orders_always = always; huge_shmem_orders_madvise = madvise; huge_shmem_orders_inherit = inherit; huge_shmem_orders_within_size = within_size; shmem_orders_configured = true; return 1; err: pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); return 0; } __setup("thp_shmem=", setup_thp_shmem); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ /* * tiny-shmem: simple shmemfs and tmpfs using ramfs code * * This is intended for small system where the benefits of the full * shmem code (swap-backed and resource-limited) are outweighed by * their complexity. On systems without swap this code should be * effectively equivalent, but much lighter weight. */ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; void __init shmem_init(void) { BUG_ON(register_filesystem(&shmem_fs_type) != 0); shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); } int shmem_unuse(unsigned int type) { return 0; } int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } void shmem_unlock_mapping(struct address_space *mapping) { } #ifdef CONFIG_MMU unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #endif void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); } #endif /* CONFIG_SHMEM */ /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags, unsigned int i_flags) { struct inode *inode; struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { shmem_unacct_size(flags, size); return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (!IS_ERR(res)) res = alloc_file_pseudo(inode, mnt, name, O_RDWR, &shmem_file_operations); if (IS_ERR(res)) iput(inode); return res; } /** * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap */ int shmem_zero_setup(struct vm_area_struct *vma) { struct file *file; loff_t size = vma->vm_end - vma->vm_start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; int error; error = shmem_get_folio_gfp(inode, index, i_size_read(inode), &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); folio_unlock(folio); return folio; #else /* * The tiny !SHMEM case uses ramfs without swap */ return mapping_read_folio_gfp(mapping, index, gfp); #endif } EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); struct page *page; if (IS_ERR(folio)) return &folio->page; page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); return ERR_PTR(-EIO); } return page; } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
20 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UIDGID_H #define _LINUX_UIDGID_H /* * A set of types for the internal kernel types representing uids and gids. * * The types defined in this header allow distinguishing which uids and gids in * the kernel are values used by userspace and which uid and gid values are * the internal kernel values. With the addition of user namespaces the values * can be different. Using the type system makes it possible for the compiler * to detect when we overlook these differences. * */ #include <linux/uidgid_types.h> #include <linux/highuid.h> struct user_namespace; extern struct user_namespace init_user_ns; struct uid_gid_map; #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } #ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; } static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } #else static inline uid_t __kuid_val(kuid_t uid) { return 0; } static inline gid_t __kgid_val(kgid_t gid) { return 0; } #endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) #define INVALID_UID KUIDT_INIT(-1) #define INVALID_GID KGIDT_INIT(-1) static inline bool uid_eq(kuid_t left, kuid_t right) { return __kuid_val(left) == __kuid_val(right); } static inline bool gid_eq(kgid_t left, kgid_t right) { return __kgid_val(left) == __kgid_val(right); } static inline bool uid_gt(kuid_t left, kuid_t right) { return __kuid_val(left) > __kuid_val(right); } static inline bool gid_gt(kgid_t left, kgid_t right) { return __kgid_val(left) > __kgid_val(right); } static inline bool uid_gte(kuid_t left, kuid_t right) { return __kuid_val(left) >= __kuid_val(right); } static inline bool gid_gte(kgid_t left, kgid_t right) { return __kgid_val(left) >= __kgid_val(right); } static inline bool uid_lt(kuid_t left, kuid_t right) { return __kuid_val(left) < __kuid_val(right); } static inline bool gid_lt(kgid_t left, kgid_t right) { return __kgid_val(left) < __kgid_val(right); } static inline bool uid_lte(kuid_t left, kuid_t right) { return __kuid_val(left) <= __kuid_val(right); } static inline bool gid_lte(kgid_t left, kgid_t right) { return __kgid_val(left) <= __kgid_val(right); } static inline bool uid_valid(kuid_t uid) { return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS extern kuid_t make_kuid(struct user_namespace *from, uid_t uid); extern kgid_t make_kgid(struct user_namespace *from, gid_t gid); extern uid_t from_kuid(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid(struct user_namespace *to, kgid_t gid); extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid); static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return from_kuid(ns, uid) != (uid_t) -1; } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return from_kgid(ns, gid) != (gid_t) -1; } u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count); #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) { return KUIDT_INIT(uid); } static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) { return KGIDT_INIT(gid); } static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid) { return __kuid_val(kuid); } static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid) { return __kgid_val(kgid); } static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) { uid_t uid = from_kuid(to, kuid); if (uid == (uid_t)-1) uid = overflowuid; return uid; } static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) { gid_t gid = from_kgid(to, kgid); if (gid == (gid_t)-1) gid = overflowgid; return gid; } static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return gid_valid(gid); } static inline u32 map_id_down(struct uid_gid_map *map, u32 id) { return id; } static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { return id; } static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Glue Code for the AVX/AES-NI/GFNI assembler implementation of the ARIA Cipher * * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> */ #include <crypto/algapi.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/export.h> #include <linux/module.h> #include <linux/types.h> #include "ecb_cbc_helpers.h" #include "aria-avx.h" asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_encrypt_16way); asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_decrypt_16way); asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx_ctr_crypt_16way); asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_encrypt_16way); asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_decrypt_16way); asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_ctr_crypt_16way); static struct aria_avx_ops aria_ops; struct aria_avx_request_ctx { u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE]; }; static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey) { ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way); ECB_BLOCK(1, aria_encrypt); ECB_WALK_END(); } static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey) { ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way); ECB_BLOCK(1, aria_decrypt); ECB_WALK_END(); } static int aria_avx_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_encrypt(req, ctx->enc_key[0]); } static int aria_avx_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_decrypt(req, ctx->dec_key[0]); } static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return aria_set_key(&tfm->base, key, keylen); } static int aria_avx_ctr_encrypt(struct skcipher_request *req) { struct aria_avx_request_ctx *req_ctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) { kernel_fpu_begin(); aria_ops.aria_ctr_crypt_16way(ctx, dst, src, &req_ctx->keystream[0], walk.iv); kernel_fpu_end(); dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE; src += ARIA_AESNI_PARALLEL_BLOCK_SIZE; nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE; } while (nbytes >= ARIA_BLOCK_SIZE) { memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE); crypto_inc(walk.iv, ARIA_BLOCK_SIZE); aria_encrypt(ctx, &req_ctx->keystream[0], &req_ctx->keystream[0]); crypto_xor_cpy(dst, src, &req_ctx->keystream[0], ARIA_BLOCK_SIZE); dst += ARIA_BLOCK_SIZE; src += ARIA_BLOCK_SIZE; nbytes -= ARIA_BLOCK_SIZE; } if (walk.nbytes == walk.total && nbytes > 0) { memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE); crypto_inc(walk.iv, ARIA_BLOCK_SIZE); aria_encrypt(ctx, &req_ctx->keystream[0], &req_ctx->keystream[0]); crypto_xor_cpy(dst, src, &req_ctx->keystream[0], nbytes); dst += nbytes; src += nbytes; nbytes = 0; } err = skcipher_walk_done(&walk, nbytes); } return err; } static int aria_avx_init_tfm(struct crypto_skcipher *tfm) { crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx_request_ctx)); return 0; } static struct skcipher_alg aria_algs[] = { { .base.cra_name = "ecb(aria)", .base.cra_driver_name = "ecb-aria-avx", .base.cra_priority = 400, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, .min_keysize = ARIA_MIN_KEY_SIZE, .max_keysize = ARIA_MAX_KEY_SIZE, .setkey = aria_avx_set_key, .encrypt = aria_avx_ecb_encrypt, .decrypt = aria_avx_ecb_decrypt, }, { .base.cra_name = "ctr(aria)", .base.cra_driver_name = "ctr-aria-avx", .base.cra_priority = 400, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, .min_keysize = ARIA_MIN_KEY_SIZE, .max_keysize = ARIA_MAX_KEY_SIZE, .ivsize = ARIA_BLOCK_SIZE, .chunksize = ARIA_BLOCK_SIZE, .walksize = 16 * ARIA_BLOCK_SIZE, .setkey = aria_avx_set_key, .encrypt = aria_avx_ctr_encrypt, .decrypt = aria_avx_ctr_encrypt, .init = aria_avx_init_tfm, } }; static int __init aria_avx_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } if (boot_cpu_has(X86_FEATURE_GFNI)) { aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way; aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way; aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way; } else { aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way; aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way; aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way; } return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx_exit(void) { crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx_init); module_exit(aria_avx_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>"); MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX/AES-NI/GFNI optimized"); MODULE_ALIAS_CRYPTO("aria"); MODULE_ALIAS_CRYPTO("aria-aesni-avx");
13 14 14 14 7 7 7 7 7 7 7 7 7 7 6 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us> */ #ifndef __NET_TC_VLAN_H #define __NET_TC_VLAN_H #include <net/act_api.h> #include <linux/tc_act/tc_vlan.h> struct tcf_vlan_params { int action; int tcfv_action; unsigned char tcfv_push_dst[ETH_ALEN]; unsigned char tcfv_push_src[ETH_ALEN]; u16 tcfv_push_vid; __be16 tcfv_push_proto; u8 tcfv_push_prio; bool tcfv_push_prio_exists; struct rcu_head rcu; }; struct tcf_vlan { struct tc_action common; struct tcf_vlan_params __rcu *vlan_p; }; #define to_vlan(a) ((struct tcf_vlan *)a) static inline u32 tcf_vlan_action(const struct tc_action *a) { u32 tcfv_action; rcu_read_lock(); tcfv_action = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_action; rcu_read_unlock(); return tcfv_action; } static inline u16 tcf_vlan_push_vid(const struct tc_action *a) { u16 tcfv_push_vid; rcu_read_lock(); tcfv_push_vid = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_vid; rcu_read_unlock(); return tcfv_push_vid; } static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) { __be16 tcfv_push_proto; rcu_read_lock(); tcfv_push_proto = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_proto; rcu_read_unlock(); return tcfv_push_proto; } static inline u8 tcf_vlan_push_prio(const struct tc_action *a) { u8 tcfv_push_prio; rcu_read_lock(); tcfv_push_prio = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_prio; rcu_read_unlock(); return tcfv_push_prio; } static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest, const struct tc_action *a) { rcu_read_lock(); memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN); memcpy(src, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); rcu_read_unlock(); } #endif /* __NET_TC_VLAN_H */
191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 // SPDX-License-Identifier: GPL-2.0 /* * procfs-based user access to knfsd statistics * * /proc/net/rpc/nfsd * * Format: * rc <hits> <misses> <nocache> * Statistsics for the reply cache * fh <stale> <deprecated filehandle cache stats> * statistics for filehandle lookup * io <bytes-read> <bytes-written> * statistics for IO throughput * th <threads> <deprecated thread usage histogram stats> * number of threads * ra <deprecated ra-cache stats> * * plus generic RPC stats (see net/sunrpc/stats.c) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/sunrpc/stats.h> #include <net/net_namespace.h> #include "nfsd.h" static int nfsd_show(struct seq_file *seq, void *v) { struct net *net = pde_data(file_inode(seq->file)); struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) seq_puts(seq, " 0.000"); /* deprecated ra-cache stats */ seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); /* show my rpc info */ svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); } seq_printf(seq, "\nwdeleg_getattr %lld", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif return 0; } DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); struct proc_dir_entry *nfsd_proc_stat_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } void nfsd_proc_stat_shutdown(struct net *net) { svc_proc_unregister(net, "nfsd"); }
1 4 236 237 10 10 10 10 2 240 238 2 2 33 198 243 5 246 2 242 235 206 246 248 8 16 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 /* * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved. * Copyright (c) 2010 Voltaire Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include <linux/export.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> #include <linux/module.h> #include "core_priv.h" static struct { const struct rdma_nl_cbs *cb_table; /* Synchronizes between ongoing netlink commands and netlink client * unregistration. */ struct rw_semaphore sem; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; bool rdma_nl_chk_listeners(unsigned int group) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net); return netlink_has_listeners(rnet->nl_sock, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, }; /* * This BUILD_BUG_ON is intended to catch addition of new * RDMA netlink protocol without updating the array above. */ BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); if (type >= RDMA_NL_NUM_CLIENTS) return false; return op < max_num_ops[type]; } static const struct rdma_nl_cbs * get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op) { const struct rdma_nl_cbs *cb_table; /* * Currently only NLDEV client is supporting netlink commands in * non init_net net namespace. */ if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV) return NULL; cb_table = READ_ONCE(rdma_nl_types[type].cb_table); if (!cb_table) { /* * Didn't get valid reference of the table, attempt module * load once. */ up_read(&rdma_nl_types[type].sem); request_module("rdma-netlink-subsys-%u", type); down_read(&rdma_nl_types[type].sem); cb_table = READ_ONCE(rdma_nl_types[type].cb_table); } if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return NULL; return cb_table; } void rdma_nl_register(unsigned int index, const struct rdma_nl_cbs cb_table[]) { if (WARN_ON(!is_nl_msg_valid(index, 0)) || WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table))) return; /* Pairs with the READ_ONCE in is_nl_valid() */ smp_store_release(&rdma_nl_types[index].cb_table, cb_table); } EXPORT_SYMBOL(rdma_nl_register); void rdma_nl_unregister(unsigned int index) { down_write(&rdma_nl_types[index].sem); rdma_nl_types[index].cb_table = NULL; up_write(&rdma_nl_types[index].sem); } EXPORT_SYMBOL(rdma_nl_unregister); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int len, int client, int op, int flags) { *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags); if (!*nlh) return NULL; return nlmsg_data(*nlh); } EXPORT_SYMBOL(ibnl_put_msg); int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int len, void *data, int type) { if (nla_put(skb, type, len, data)) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } return 0; } EXPORT_SYMBOL(ibnl_put_attr); static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int type = nlh->nlmsg_type; unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); const struct rdma_nl_cbs *cb_table; int err = -EINVAL; if (!is_nl_msg_valid(index, op)) return -EINVAL; down_read(&rdma_nl_types[index].sem); cb_table = get_cb_table(skb, index, op); if (!cb_table) goto done; if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && !netlink_capable(skb, CAP_NET_ADMIN)) { err = -EPERM; goto done; } /* * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't * mistakenly call the .dump() function. */ if (index == RDMA_NL_LS) { if (cb_table[op].doit) err = cb_table[op].doit(skb, nlh, extack); goto done; } /* FIXME: Convert IWCM to properly handle doit callbacks */ if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { struct netlink_dump_control c = { .dump = cb_table[op].dump, }; if (c.dump) err = netlink_dump_start(skb->sk, skb, nlh, &c); goto done; } if (cb_table[op].doit) err = cb_table[op].doit(skb, nlh, extack); done: up_read(&rdma_nl_types[index].sem); return err; } /* * This function is similar to netlink_rcv_skb with one exception: * It calls to the callback for the netlink messages without NLM_F_REQUEST * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed * for that consumer only. */ static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* * Generally speaking, the only requests are handled * by the kernel, but RDMA_NL_LS is different, because it * runs backward netlink scheme. Kernel initiates messages * and waits for reply with data to keep pathrecord cache * in sync. */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) && (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } static void rdma_nl_rcv(struct sk_buff *skb) { rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); } int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast); int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; err = netlink_unicast(rnet->nl_sock, skb, pid, 0); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast_wait); int rdma_nl_multicast(struct net *net, struct sk_buff *skb, unsigned int group, gfp_t flags) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags); } EXPORT_SYMBOL(rdma_nl_multicast); void rdma_nl_init(void) { int idx; for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) init_rwsem(&rdma_nl_types[idx].sem); } void rdma_nl_exit(void) { int idx; for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) WARN(rdma_nl_types[idx].cb_table, "Netlink client %d wasn't released prior to unloading %s\n", idx, KBUILD_MODNAME); } int rdma_nl_net_init(struct rdma_dev_net *rnet) { struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, .flags = NL_CFG_F_NONROOT_RECV, }; struct sock *nls; nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); if (!nls) return -ENOMEM; nls->sk_sndtimeo = 10 * HZ; rnet->nl_sock = nls; return 0; } void rdma_nl_net_exit(struct rdma_dev_net *rnet) { netlink_kernel_release(rnet->nl_sock); } MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA);
64 1284 1383 1 1344 1344 1344 1294 1327 1284 1294 34 1294 1326 1329 5 1328 1331 1325 1330 1325 1327 5 1297 34 1293 35 1330 1328 47 1284 3191 3199 65 65 65 64 64 1 64 65 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/fs_struct.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" #include "internal.h" struct prepend_buffer { char *buf; int len; }; #define DECLARE_BUFFER(__name, __buf, __len) \ struct prepend_buffer __name = {.buf = __buf + __len, .len = __len} static char *extract_string(struct prepend_buffer *p) { if (likely(p->len >= 0)) return p->buf; return ERR_PTR(-ENAMETOOLONG); } static bool prepend_char(struct prepend_buffer *p, unsigned char c) { if (likely(p->len > 0)) { p->len--; *--p->buf = c; return true; } p->len = -1; return false; } /* * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel * copy might fault as a result. * * The end result will correct itself when we check the * rename sequence count, but we need to be able to handle * the fault gracefully. */ static bool prepend_copy(void *dst, const void *src, int len) { if (unlikely(copy_from_kernel_nofault(dst, src, len))) { memset(dst, 'x', len); return false; } return true; } static bool prepend(struct prepend_buffer *p, const char *str, int namelen) { // Already overflowed? if (p->len < 0) return false; // Will overflow? if (p->len < namelen) { // Fill as much as possible from the end of the name str += namelen - p->len; p->buf -= p->len; prepend_copy(p->buf, str, p->len); p->len = -1; return false; } // Fits fully p->len -= namelen; p->buf -= namelen; return prepend_copy(p->buf, str, namelen); } /** * prepend_name - prepend a pathname in front of current buffer pointer * @p: prepend buffer which contains buffer pointer and allocated length * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * But since the length cannot be trusted, we need to copy the name very * carefully when doing the prepend_copy(). It also prepends "/" at * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * * Load acquire is needed to make sure that we see the new name data even * if we might get the length wrong. */ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, const struct path *root, struct prepend_buffer *p) { while (dentry != root->dentry || &mnt->mnt != root->mnt) { const struct dentry *parent = READ_ONCE(dentry->d_parent); if (dentry == mnt->mnt.mnt_root) { struct mount *m = READ_ONCE(mnt->mnt_parent); struct mnt_namespace *mnt_ns; if (likely(mnt != m)) { dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = m; continue; } /* Global root */ mnt_ns = READ_ONCE(mnt->mnt_ns); /* open-coded is_mounted() to use local mnt_ns */ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) return 1; // absolute root else return 2; // detached or not attached yet } if (unlikely(dentry == parent)) /* Escaped? */ return 3; prefetch(parent); if (!prepend_name(p, &dentry->d_name)) break; dentry = parent; } return 0; } /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. * It only checks the sequence number of the global rename_lock as any change * in the dentry's d_seq will be preceded by changes in the rename_lock * sequence number. If the sequence number had been changed, it will restart * the whole pathname back-tracing sequence again by taking the rename_lock. * In this case, there is no need to take the RCU read lock as the recursive * parent pointer references will keep the dentry chain alive as long as no * rename operation is performed. */ static int prepend_path(const struct path *path, const struct path *root, struct prepend_buffer *p) { unsigned seq, m_seq = 0; struct prepend_buffer b; int error; rcu_read_lock(); restart_mnt: read_seqbegin_or_lock(&mount_lock, &m_seq); seq = 0; rcu_read_lock(); restart: b = *p; read_seqbegin_or_lock(&rename_lock, &seq); error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b); if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (!(m_seq & 1)) rcu_read_unlock(); if (need_seqretry(&mount_lock, m_seq)) { m_seq = 1; goto restart_mnt; } done_seqretry(&mount_lock, m_seq); if (unlikely(error == 3)) b = *p; if (b.len == p->len) prepend_char(&b, '/'); *p = b; return error; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. * * Returns a pointer into the buffer or an error code if the * path was too long. * * "buflen" should be positive. * * If the path is not reachable from the supplied root, return %NULL. */ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); } char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); } static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; } while (read_seqretry(&fs->seq, seq)); } /** * d_path - return the path of a dentry * @path: path to report * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * * Returns a pointer into the buffer or an error code if the path was * too long. Note: Callers should use the returned pointer, not the passed * in buffer, to use the name! The implementation often starts at an offset * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ char *d_path(const struct path *path, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); struct path root; /* * We have various synthetic filesystems that never get mounted. On * these filesystems dentries are never used for lookup purposes, and * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: * * Some pseudo inodes are mountable. When they are mounted * path->dentry == path->mnt->mnt_root. In that case don't call d_dname * and instead have d_path return the mounted path. */ if (path->dentry->d_op && path->dentry->d_op->d_dname && (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); get_fs_root_rcu(current->fs, &root); if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); return extract_string(&b); } EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; int sz; va_start(args, fmt); sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; va_end(args); if (sz > sizeof(temp) || sz > buflen) return ERR_PTR(-ENAMETOOLONG); buffer += buflen - sz; return memcpy(buffer, temp, sz); } char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { DECLARE_BUFFER(b, buffer, buflen); /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); prepend_char(&b, '/'); return extract_string(&b); } /* * Write full pathname from the root of the filesystem into the buffer. */ static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p) { const struct dentry *dentry; struct prepend_buffer b; int seq = 0; rcu_read_lock(); restart: dentry = d; b = *p; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { const struct dentry *parent = dentry->d_parent; prefetch(parent); if (!prepend_name(&b, &dentry->d_name)) break; dentry = parent; } if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (b.len == p->len) prepend_char(&b, '/'); return extract_string(&b); } char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else prepend_char(&b, 0); return __dentry_path(dentry, &b); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, struct path *pwd) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; *pwd = fs->pwd; } while (read_seqretry(&fs->seq, seq)); } /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which * includes the ending '\0' character), or a negative * error value. So libc would do something like * * char *getcwd(char * buf, size_t size) * { * int retval; * * retval = sys_getcwd(buf, size); * if (retval >= 0) * return buf; * errno = -retval; * return NULL; * } */ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; char *page = __getname(); if (!page) return -ENOMEM; rcu_read_lock(); get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); if (unlikely(d_unlinked(pwd.dentry))) { rcu_read_unlock(); error = -ENOENT; } else { unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); len = PATH_MAX - b.len; if (unlikely(len > PATH_MAX)) error = -ENAMETOOLONG; else if (unlikely(len > size)) error = -ERANGE; else if (copy_to_user(buf, b.buf, len)) error = -EFAULT; else error = len; } __putname(page); return error; }
20 791 772 20 2 2 2 2 2 1 1 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 // SPDX-License-Identifier: GPL-2.0 /* * Author: Andrei Vagin <avagin@openvz.org> * Author: Dmitry Safonov <dima@arista.com> */ #include <linux/time_namespace.h> #include <linux/user_namespace.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/clocksource.h> #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> #include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> #include <vdso/datapage.h> ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) { ktime_t offset; switch (clockid) { case CLOCK_MONOTONIC: offset = timespec64_to_ktime(ns_offsets->monotonic); break; case CLOCK_BOOTTIME: case CLOCK_BOOTTIME_ALARM: offset = timespec64_to_ktime(ns_offsets->boottime); break; default: return tim; } /* * Check that @tim value is in [offset, KTIME_MAX + offset] * and subtract offset. */ if (tim < offset) { /* * User can specify @tim *absolute* value - if it's lesser than * the time namespace's offset - it's already expired. */ tim = 0; } else { tim = ktime_sub(tim, offset); if (unlikely(tim > KTIME_MAX)) tim = KTIME_MAX; } return tim; } static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); } static void dec_time_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); } /** * clone_time_ns - Clone a time namespace * @user_ns: User namespace which owns a new namespace. * @old_ns: Namespace to clone * * Clone @old_ns and set the clone refcount to 1 * * Return: The new namespace or ERR_PTR. */ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, struct time_namespace *old_ns) { struct time_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; ucounts = inc_time_namespaces(user_ns); if (!ucounts) goto fail; err = -ENOMEM; ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; ns_tree_add(ns); return ns; fail_free_page: __free_page(ns->vvar_page); fail_free: kfree(ns); fail_dec: dec_time_namespaces(ucounts); fail: return ERR_PTR(err); } /** * copy_time_ns - Create timens_for_children from @old_ns * @flags: Cloning flags * @user_ns: User namespace which owns a new namespace. * @old_ns: Namespace to clone * * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; * adds a refcounter to @old_ns otherwise. * * Return: timens_for_children namespace or ERR_PTR. */ struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) return get_time_ns(old_ns); return clone_time_ns(user_ns, old_ns); } static struct timens_offset offset_from_ts(struct timespec64 off) { struct timens_offset ret; ret.sec = off.tv_sec; ret.nsec = off.tv_nsec; return ret; } /* * A time namespace VVAR page has the same layout as the VVAR page which * contains the system wide VDSO data. * * For a normal task the VVAR pages are installed in the normal ordering: * VVAR * PVCLOCK * HVCLOCK * TIMENS <- Not really required * * Now for a timens task the pages are installed in the following order: * TIMENS * PVCLOCK * HVCLOCK * VVAR * * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ static void timens_setup_vdso_clock_data(struct vdso_clock *vc, struct time_namespace *ns) { struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); vc->seq = 1; vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; offset[CLOCK_BOOTTIME] = boottime; offset[CLOCK_BOOTTIME_ALARM] = boottime; } struct page *find_timens_vvar_page(struct vm_area_struct *vma) { if (likely(vma->vm_mm == current->mm)) return current->nsproxy->time_ns->vvar_page; /* * VM_PFNMAP | VM_IO protect .fault() handler from being called * through interfaces like /proc/$pid/mem or * process_vm_{readv,writev}() as long as there's no .access() * in special_mapping_vmops(). * For more details check_vma_flags() and __access_remote_vm() */ WARN(1, "vvar_page accessed remotely"); return NULL; } /* * Protects possibly multiple offsets writers racing each other * and tasks entering the namespace. */ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { struct vdso_time_data *vdata; struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) return; /* Fast-path, taken by every task in namespace except the first. */ if (likely(ns->frozen_offsets)) return; mutex_lock(&offset_lock); /* Nothing to-do: vvar_page has been already initialized. */ if (ns->frozen_offsets) goto out; ns->frozen_offsets = true; vdata = page_address(ns->vvar_page); vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) timens_setup_vdso_clock_data(&vc[i], ns); if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); } out: mutex_unlock(&offset_lock); } void free_time_ns(struct time_namespace *ns) { ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->time_ns; get_time_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static struct ns_common *timens_for_children_get(struct task_struct *task) { struct time_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->time_ns_for_children; get_time_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void timens_put(struct ns_common *ns) { put_time_ns(to_time_ns(ns)); } void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { timens_set_vvar_page(tsk, ns); vdso_join_timens(tsk, ns); } static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); if (!current_is_single_threaded()) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; get_time_ns(ns); put_time_ns(nsproxy->time_ns_for_children); nsproxy->time_ns_for_children = ns; return 0; } void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) return; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; timens_commit(tsk, ns); } static struct user_namespace *timens_owner(struct ns_common *ns) { return to_time_ns(ns)->user_ns; } static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) { char *clock; switch (clockid) { case CLOCK_BOOTTIME: clock = "boottime"; break; case CLOCK_MONOTONIC: clock = "monotonic"; break; default: clock = "unknown"; break; } seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) { struct ns_common *ns; struct time_namespace *time_ns; ns = timens_for_children_get(p); if (!ns) return; time_ns = to_time_ns(ns); show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); put_time_ns(time_ns); } int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int noffsets) { struct ns_common *ns; struct time_namespace *time_ns; struct timespec64 tp; int i, err; ns = timens_for_children_get(p); if (!ns) return -ESRCH; time_ns = to_time_ns(ns); if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { put_time_ns(time_ns); return -EPERM; } for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; switch (off->clockid) { case CLOCK_MONOTONIC: ktime_get_ts64(&tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(&tp); break; default: err = -EINVAL; goto out; } err = -ERANGE; if (off->val.tv_sec > KTIME_SEC_MAX || off->val.tv_sec < -KTIME_SEC_MAX) goto out; tp = timespec64_add(tp, off->val); /* * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is * still unreachable. */ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) goto out; } mutex_lock(&offset_lock); if (time_ns->frozen_offsets) { err = -EACCES; goto out_unlock; } err = 0; /* Don't report errors after this line */ for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; struct timespec64 *offset = NULL; switch (off->clockid) { case CLOCK_MONOTONIC: offset = &time_ns->offsets.monotonic; break; case CLOCK_BOOTTIME: offset = &time_ns->offsets.boottime; break; } *offset = off->val; } out_unlock: mutex_unlock(&offset_lock); out: put_time_ns(time_ns); return err; } const struct proc_ns_operations timens_operations = { .name = "time", .get = timens_get, .put = timens_put, .install = timens_install, .owner = timens_owner, }; const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", .get = timens_for_children_get, .put = timens_put, .install = timens_install, .owner = timens_owner, }; struct time_namespace init_time_ns = { .ns.ns_type = ns_common_type(&init_time_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = ns_init_inum(&init_time_ns), .ns.ops = &timens_operations, .frozen_offsets = true, }; void __init time_ns_init(void) { ns_tree_add(&init_time_ns); }
18 18 18 42 27 2 52 206 205 27 206 197 219 218 209 217 212 212 213 210 213 212 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> #include "devl_internal.h" EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); static struct devlink *devlinks_xa_get(unsigned long index) { struct devlink *devlink; rcu_read_lock(); devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED); if (!devlink || !devlink_try_get(devlink)) devlink = NULL; rcu_read_unlock(); return devlink; } /* devlink_rels xarray contains 1:1 relationships between * devlink object and related nested devlink instance. * The xarray index is used to get the nested object from * the nested-in object code. */ static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1); #define DEVLINK_REL_IN_USE XA_MARK_0 struct devlink_rel { u32 index; refcount_t refcount; u32 devlink_index; struct { u32 devlink_index; u32 obj_index; devlink_rel_notify_cb_t *notify_cb; devlink_rel_cleanup_cb_t *cleanup_cb; struct delayed_work notify_work; } nested_in; }; static void devlink_rel_free(struct devlink_rel *rel) { xa_erase(&devlink_rels, rel->index); kfree(rel); } static void __devlink_rel_get(struct devlink_rel *rel) { refcount_inc(&rel->refcount); } static void __devlink_rel_put(struct devlink_rel *rel) { if (refcount_dec_and_test(&rel->refcount)) devlink_rel_free(rel); } static void devlink_rel_nested_in_notify_work(struct work_struct *work) { struct devlink_rel *rel = container_of(work, struct devlink_rel, nested_in.notify_work.work); struct devlink *devlink; devlink = devlinks_xa_get(rel->nested_in.devlink_index); if (!devlink) goto rel_put; if (!devl_trylock(devlink)) { devlink_put(devlink); goto reschedule_work; } if (!devl_is_registered(devlink)) { devl_unlock(devlink); devlink_put(devlink); goto rel_put; } if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE)) rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index); rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index); devl_unlock(devlink); devlink_put(devlink); rel_put: __devlink_rel_put(rel); return; reschedule_work: schedule_delayed_work(&rel->nested_in.notify_work, 1); } static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel) { __devlink_rel_get(rel); schedule_delayed_work(&rel->nested_in.notify_work, 0); } static struct devlink_rel *devlink_rel_alloc(void) { struct devlink_rel *rel; static u32 next; int err; rel = kzalloc(sizeof(*rel), GFP_KERNEL); if (!rel) return ERR_PTR(-ENOMEM); err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); if (err < 0) { kfree(rel); return ERR_PTR(err); } refcount_set(&rel->refcount, 1); INIT_DELAYED_WORK(&rel->nested_in.notify_work, &devlink_rel_nested_in_notify_work); return rel; } static void devlink_rel_put(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink_rel_nested_in_notify_work_schedule(rel); __devlink_rel_put(rel); devlink->rel = NULL; } void devlink_rel_nested_in_clear(u32 rel_index) { xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE); } int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index, u32 obj_index, devlink_rel_notify_cb_t *notify_cb, devlink_rel_cleanup_cb_t *cleanup_cb, struct devlink *devlink) { struct devlink_rel *rel = devlink_rel_alloc(); ASSERT_DEVLINK_NOT_REGISTERED(devlink); if (IS_ERR(rel)) return PTR_ERR(rel); rel->devlink_index = devlink->index; rel->nested_in.devlink_index = devlink_index; rel->nested_in.obj_index = obj_index; rel->nested_in.notify_cb = notify_cb; rel->nested_in.cleanup_cb = cleanup_cb; *rel_index = rel->index; xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink->rel = rel; return 0; } /** * devlink_rel_nested_in_notify - Notify the object this devlink * instance is nested in. * @devlink: devlink * * This is called upon network namespace change of devlink instance. * In case this devlink instance is nested in another devlink object, * a notification of a change of this object should be sent * over netlink. The parent devlink instance lock needs to be * taken during the notification preparation. * However, since the devlink lock of nested instance is held here, * we would end with wrong devlink instance lock ordering and * deadlock. Therefore the work is utilized to avoid that. */ void devlink_rel_nested_in_notify(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; devlink_rel_nested_in_notify_work_schedule(rel); } static struct devlink_rel *devlink_rel_find(unsigned long rel_index) { return xa_find(&devlink_rels, &rel_index, rel_index, DEVLINK_REL_IN_USE); } static struct devlink *devlink_rel_devlink_get(u32 rel_index) { struct devlink_rel *rel; u32 devlink_index; if (!rel_index) return NULL; xa_lock(&devlink_rels); rel = devlink_rel_find(rel_index); if (rel) devlink_index = rel->devlink_index; xa_unlock(&devlink_rels); if (!rel) return NULL; return devlinks_xa_get(devlink_index); } int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink, u32 rel_index, int attrtype, bool *msg_updated) { struct net *net = devlink_net(devlink); struct devlink *rel_devlink; int err; rel_devlink = devlink_rel_devlink_get(rel_index); if (!rel_devlink) return 0; err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype); devlink_put(rel_devlink); if (!err && msg_updated) *msg_updated = true; return err; } void *devlink_priv(struct devlink *devlink) { return &devlink->priv; } EXPORT_SYMBOL_GPL(devlink_priv); struct devlink *priv_to_devlink(void *priv) { return container_of(priv, struct devlink, priv); } EXPORT_SYMBOL_GPL(priv_to_devlink); struct device *devlink_to_dev(const struct devlink *devlink) { return devlink->dev; } EXPORT_SYMBOL_GPL(devlink_to_dev); struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } EXPORT_SYMBOL_GPL(devlink_net); void devl_assert_locked(struct devlink *devlink) { lockdep_assert_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_assert_locked); #ifdef CONFIG_LOCKDEP /* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ bool devl_lock_is_held(struct devlink *devlink) { return lockdep_is_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock_is_held); #endif void devl_lock(struct devlink *devlink) { mutex_lock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock); int devl_trylock(struct devlink *devlink) { return mutex_trylock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_trylock); void devl_unlock(struct devlink *devlink) { mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_unlock); /** * devlink_try_get() - try to obtain a reference on a devlink instance * @devlink: instance to reference * * Obtain a reference on a devlink instance. A reference on a devlink instance * only implies that it's safe to take the instance lock. It does not imply * that the instance is registered, use devl_is_registered() after taking * the instance lock to check registration status. */ struct devlink *__must_check devlink_try_get(struct devlink *devlink) { if (refcount_inc_not_zero(&devlink->refcount)) return devlink; return NULL; } static void devlink_release(struct work_struct *work) { struct devlink *devlink; devlink = container_of(to_rcu_work(work), struct devlink, rwork); mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); kvfree(devlink); } void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) queue_rcu_work(system_percpu_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) { struct devlink *devlink = NULL; rcu_read_lock(); retry: devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); if (!devlink) goto unlock; if (!devlink_try_get(devlink)) goto next; if (!net_eq(devlink_net(devlink), net)) { devlink_put(devlink); goto next; } unlock: rcu_read_unlock(); return devlink; next: (*indexp)++; goto retry; } /** * devl_register - Register devlink instance * @devlink: devlink */ int devl_register(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); devl_assert_locked(devlink); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify_register(devlink); devlink_rel_nested_in_notify(devlink); return 0; } EXPORT_SYMBOL_GPL(devl_register); void devlink_register(struct devlink *devlink) { devl_lock(devlink); devl_register(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_register); /** * devl_unregister - Unregister devlink instance * @devlink: devlink */ void devl_unregister(struct devlink *devlink) { ASSERT_DEVLINK_REGISTERED(devlink); devl_assert_locked(devlink); devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_rel_put(devlink); } EXPORT_SYMBOL_GPL(devl_unregister); void devlink_unregister(struct devlink *devlink) { devl_lock(devlink); devl_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_unregister); /** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace * * @ops: ops * @priv_size: size of user private data * @net: net namespace * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. */ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, size_t priv_size, struct net *net, struct device *dev) { struct devlink *devlink; static u32 last_id; int ret; WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL); if (!devlink) return NULL; ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, &last_id, GFP_KERNEL); if (ret < 0) goto err_xa_alloc; devlink->dev = get_device(dev); devlink->ops = ops; xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); xa_init_flags(&devlink->params, XA_FLAGS_ALLOC); xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC); write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->region_list); INIT_LIST_HEAD(&devlink->reporter_list); INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); INIT_LIST_HEAD(&devlink->trap_policer_list); INIT_RCU_WORK(&devlink->rwork, devlink_release); lockdep_register_key(&devlink->lock_key); mutex_init(&devlink->lock); lockdep_set_class(&devlink->lock, &devlink->lock_key); refcount_set(&devlink->refcount, 1); return devlink; err_xa_alloc: kvfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_free - Free devlink instance resources * * @devlink: devlink */ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); WARN_ON(!list_empty(&devlink->region_list)); WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); xa_destroy(&devlink->nested_rels); xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->params); xa_destroy(&devlink->ports); xa_erase(&devlinks, devlink->index); devlink_put(devlink); } EXPORT_SYMBOL_GPL(devlink_free); static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ devlinks_xa_for_each_registered_get(net, index, devlink) { devl_dev_lock(devlink, true); err = 0; if (devl_is_registered(devlink)) err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); devl_dev_unlock(devlink, true); devlink_put(devlink); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } } static struct pernet_operations devlink_pernet_ops __net_initdata = { .pre_exit = devlink_pernet_pre_exit, }; static struct notifier_block devlink_port_netdevice_nb = { .notifier_call = devlink_port_netdevice_event, }; static int __init devlink_init(void) { int err; err = register_pernet_subsys(&devlink_pernet_ops); if (err) goto out; err = genl_register_family(&devlink_nl_family); if (err) goto out_unreg_pernet_subsys; err = register_netdevice_notifier(&devlink_port_netdevice_nb); if (!err) return 0; genl_unregister_family(&devlink_nl_family); out_unreg_pernet_subsys: unregister_pernet_subsys(&devlink_pernet_ops); out: WARN_ON(err); return err; } subsys_initcall(devlink_init);
17618 7 17611 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 // SPDX-License-Identifier: GPL-2.0 #include <linux/fault-inject.h> #include <linux/debugfs.h> #include <linux/error-injection.h> #include <linux/mm.h> static struct { struct fault_attr attr; bool ignore_gfp_highmem; bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; static int __init setup_fail_page_alloc(char *str) { return setup_fault_attr(&fail_page_alloc.attr, str); } __setup("fail_page_alloc=", setup_fail_page_alloc); bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { int flags = 0; if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; if (fail_page_alloc.ignore_gfp_reclaim && (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; /* See comment in __should_failslab() */ if (gfp_mask & __GFP_NOWARN) flags |= FAULT_NOWARN; return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_page_alloc_debugfs(void) { umode_t mode = S_IFREG | 0600; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); debugfs_create_bool("ignore-gfp-wait", mode, dir, &fail_page_alloc.ignore_gfp_reclaim); debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem); debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); return 0; } late_initcall(fail_page_alloc_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV4_H_ #define _NF_TABLES_IPV4_H_ #include <net/netfilter/nf_tables.h> #include <net/ip.h> static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) { struct iphdr *ip; ip = ip_hdr(pkt->skb); pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = ip->protocol; pkt->thoff = ip_hdrlen(pkt->skb); pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET; } static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; u32 len, thoff, skb_len; iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*iph), &_iph); if (!iph) return -1; if (iph->ihl < 5 || iph->version != 4) return -1; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; skb_len = pkt->skb->len - skb_network_offset(pkt->skb); if (skb_len < len) return -1; else if (len < thoff) return -1; else if (thoff < sizeof(*iph)) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = skb_network_offset(pkt->skb) + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; } static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv4_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) { struct iphdr *iph; u32 len, thoff; if (!pskb_may_pull(pkt->skb, sizeof(*iph))) return -1; iph = ip_hdr(pkt->skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } else if (len < thoff) { goto inhdr_error; } else if (thoff < sizeof(*iph)) { return -1; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; inhdr_error: __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INHDRERRORS); return -1; } #endif
5 3 5 2 5 5 4 4 4 3 4 3 4 4 5 4 3 5 5 3 4 20 19 1 18 1 17 1 16 1 16 16 15 1 14 14 14 14 14 14 14 14 14 14 14 14 14 14 9 9 3 2 3 1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_queues.h> #include "netlink.h" #include "common.h" struct rings_req_info { struct ethnl_req_info base; }; struct rings_reply_data { struct ethnl_reply_data base; struct ethtool_ringparam ringparam; struct kernel_ethtool_ringparam kernel_ringparam; u32 supported_ring_params; }; #define RINGS_REPDATA(__reply_base) \ container_of(__reply_base, struct rings_reply_data, base) const struct nla_policy ethnl_rings_get_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; data->supported_ring_params = dev->ethtool_ops->supported_ring_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; } static int rings_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */ nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX*/ } static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; u32 supported_ring_params = data->supported_ring_params; WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED); if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX, ringparam->rx_pending))) || (ringparam->rx_mini_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX, ringparam->rx_mini_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI, ringparam->rx_mini_pending))) || (ringparam->rx_jumbo_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX, ringparam->rx_jumbo_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO, ringparam->rx_jumbo_pending))) || (ringparam->tx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || (kr->rx_buf_len && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, kr->tcp_data_split))) || (kr->cqe_size && (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) || nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) || ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, kr->tx_push_buf_len))) || ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) && (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH, kr->hds_thresh) || nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX, kr->hds_thresh_max)))) return -EMSGSIZE; return 0; } /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] = NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 }, }; static int ethnl_set_rings_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "setting TCP data split is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_HDS_THRESH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_HDS_THRESH], "setting hds-thresh is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_RX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_PUSH], "setting rx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "setting tx push buf len is not supported"); return -EOPNOTSUPP; } return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { struct kernel_ethtool_ringparam kernel_ringparam; struct net_device *dev = req_info->dev; struct ethtool_ringparam ringparam; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam, info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, tb[ETHTOOL_A_RINGS_RX_MINI], &mod); ethnl_update_u32(&ringparam.rx_jumbo_pending, tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ethnl_update_u8(&kernel_ringparam.tcp_data_split, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); ethnl_update_u8(&kernel_ringparam.rx_push, tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); ethnl_update_u32(&kernel_ringparam.hds_thresh, tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod); if (!mod) return 0; if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && dev_xdp_sb_prog_count(dev)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "tcp-data-split can not be enabled with single buffer XDP"); return -EINVAL; } if (dev_get_min_mp_channel_count(dev)) { if (kernel_ringparam.tcp_data_split != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(info->extack, "can't disable tcp-data-split while device has memory provider enabled"); return -EINVAL; } else if (kernel_ringparam.hds_thresh) { NL_SET_ERR_MSG(info->extack, "can't set non-zero hds_thresh while device is memory provider enabled"); return -EINVAL; } } /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_MINI]; else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_TX]; else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max) err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH]; else err_attr = NULL; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); return -EINVAL; } if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) { NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "Requested TX push buffer exceeds the maximum of %u", kernel_ringparam.tx_push_buf_max_len); return -EINVAL; } dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_rings_request_ops = { .request_cmd = ETHTOOL_MSG_RINGS_GET, .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, .hdr_attr = ETHTOOL_A_RINGS_HEADER, .req_info_size = sizeof(struct rings_req_info), .reply_data_size = sizeof(struct rings_reply_data), .prepare_data = rings_prepare_data, .reply_size = rings_reply_size, .fill_reply = rings_fill_reply, .set_validate = ethnl_set_rings_validate, .set = ethnl_set_rings, .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, };
1667 181 1511 1504 1492 1491 221 59 63 23 28 72 253 10 32 226 40 25 15 602 328 276 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 // SPDX-License-Identifier: GPL-2.0-only /* * Access kernel or user memory without faulting. */ #include <linux/export.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <asm/tlb.h> bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return true; } /* * The below only uses kmsan_check_memory() to ensure uninitialized kernel * memory isn't leaked. */ #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ kmsan_check_memory(src, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; if (!copy_from_kernel_nofault_allowed(src, size)) return -ERANGE; pagefault_disable(); if (!(align & 7)) copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); #define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __put_kernel_nofault(dst, src, type, err_label); \ instrument_write(dst, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; pagefault_disable(); if (!(align & 7)) copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { const void *src = unsafe_addr; if (unlikely(count <= 0)) return 0; if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) return -ERANGE; pagefault_disable(); do { __get_kernel_nofault(dst, src, u8, Efault); dst++; src++; } while (dst[-1] && src - unsafe_addr < count); pagefault_enable(); dst[-1] = '\0'; return src - unsafe_addr; Efault: pagefault_enable(); dst[0] = '\0'; return -EFAULT; } /** * copy_from_user_nofault(): safely attempt to read from a user-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from. This must be a user address. * @size: size of the data chunk * * Safely read from user address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; if (!__access_ok(src, size)) return ret; if (!nmi_uaccess_okay()) return ret; pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_from_user_nofault); /** * copy_to_user_nofault(): safely attempt to write to a user-space location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk * * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_to_user_nofault); /** * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe user address. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from unsafe user address to kernel buffer. * * On success, returns the length of the string INCLUDING the trailing NUL. * * If access fails, returns -EFAULT (some data may have been copied * and the trailing NUL added). * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. */ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { long ret; if (unlikely(count <= 0)) return 0; pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); if (ret >= count) { ret = count; dst[ret - 1] = '\0'; } else if (ret >= 0) { ret++; } return ret; } /** * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL. * @unsafe_addr: The string to measure. * @count: Maximum count (including NUL) * * Get the size of a NUL-terminated string in user space without pagefault. * * Returns the size of the string INCLUDING the terminating NUL. * * If the string is too long, returns a number larger than @count. User * has to check the return value against "> count". * On exception (or invalid count), returns 0. * * Unlike strnlen_user, this can be used from IRQ handler etc. because * it disables pagefaults. */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { int ret; pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); return ret; } void __copy_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } EXPORT_SYMBOL(__copy_overflow);
108 19 112 113 112 113 113 113 95 97 97 97 97 2 2 2 1 2 1 2 14 1 4 3 6 3 4 3 2 1 2 148 148 123 99 99 4 4 67 122 9 122 10 48 14 34 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 /* * net/tipc/bcast.c: TIPC broadcast code * * Copyright (c) 2004-2006, 2014-2017, Ericsson AB * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/tipc_config.h> #include "socket.h" #include "msg.h" #include "bcast.h" #include "link.h" #include "name_table.h" #define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ #define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ const char tipc_bclink_name[] = "broadcast-link"; unsigned long sysctl_tipc_bc_retruni __read_mostly; /** * struct tipc_bc_base - base structure for keeping broadcast send state * @link: broadcast send link structure * @inputq: data input queue; will only carry SOCK_WAKEUP messages * @dests: array keeping number of reachable destinations per bearer * @primary_bearer: a bearer having links to all broadcast destinations, if any * @bcast_support: indicates if primary bearer, if any, supports broadcast * @force_bcast: forces broadcast for multicast traffic * @rcast_support: indicates if all peer nodes support replicast * @force_rcast: forces replicast for multicast traffic * @rc_ratio: dest count as percentage of cluster size where send method changes * @bc_threshold: calculated from rc_ratio; if dests > threshold use broadcast */ struct tipc_bc_base { struct tipc_link *link; struct sk_buff_head inputq; int dests[MAX_BEARERS]; int primary_bearer; bool bcast_support; bool force_bcast; bool rcast_support; bool force_rcast; int rc_ratio; int bc_threshold; }; static struct tipc_bc_base *tipc_bc_base(struct net *net) { return tipc_net(net)->bcbase; } /* tipc_bcast_get_mtu(): -get the MTU currently used by broadcast link * Note: the MTU is decremented to give room for a tunnel header, in * case the message needs to be sent as replicast */ int tipc_bcast_get_mtu(struct net *net) { return tipc_link_mss(tipc_bc_sndlink(net)); } void tipc_bcast_toggle_rcast(struct net *net, bool supp) { tipc_bc_base(net)->rcast_support = supp; } static void tipc_bcbase_calc_bc_threshold(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net)); bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100); } /* tipc_bcbase_select_primary(): find a bearer with links to all destinations, * if any, and make it primary bearer */ static void tipc_bcbase_select_primary(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); int all_dests = tipc_link_bc_peers(bb->link); int max_win = tipc_link_max_win(bb->link); int min_win = tipc_link_min_win(bb->link); int i, mtu, prim; bb->primary_bearer = INVALID_BEARER_ID; bb->bcast_support = true; if (!all_dests) return; for (i = 0; i < MAX_BEARERS; i++) { if (!bb->dests[i]) continue; mtu = tipc_bearer_mtu(net, i); if (mtu < tipc_link_mtu(bb->link)) { tipc_link_set_mtu(bb->link, mtu); tipc_link_set_queue_limits(bb->link, min_win, max_win); } bb->bcast_support &= tipc_bearer_bcast_support(net, i); if (bb->dests[i] < all_dests) continue; bb->primary_bearer = i; /* Reduce risk that all nodes select same primary */ if ((i ^ tipc_own_addr(net)) & 1) break; } prim = bb->primary_bearer; if (prim != INVALID_BEARER_ID) bb->bcast_support = tipc_bearer_bcast_support(net, prim); } void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id) { struct tipc_bc_base *bb = tipc_bc_base(net); tipc_bcast_lock(net); bb->dests[bearer_id]++; tipc_bcbase_select_primary(net); tipc_bcast_unlock(net); } void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id) { struct tipc_bc_base *bb = tipc_bc_base(net); tipc_bcast_lock(net); bb->dests[bearer_id]--; tipc_bcbase_select_primary(net); tipc_bcast_unlock(net); } /* tipc_bcbase_xmit - broadcast a packet queue across one or more bearers * * Note that number of reachable destinations, as indicated in the dests[] * array, may transitionally differ from the number of destinations indicated * in each sent buffer. We can sustain this. Excess destination nodes will * drop and never acknowledge the unexpected packets, and missing destinations * will either require retransmission (if they are just about to be added to * the bearer), or be removed from the buffer's 'ackers' counter (if they * just went down) */ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) { int bearer_id; struct tipc_bc_base *bb = tipc_bc_base(net); struct sk_buff *skb, *_skb; struct sk_buff_head _xmitq; if (skb_queue_empty(xmitq)) return; /* The typical case: at least one bearer has links to all nodes */ bearer_id = bb->primary_bearer; if (bearer_id >= 0) { tipc_bearer_bc_xmit(net, bearer_id, xmitq); return; } /* We have to transmit across all bearers */ __skb_queue_head_init(&_xmitq); for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { if (!bb->dests[bearer_id]) continue; skb_queue_walk(xmitq, skb) { _skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!_skb) break; __skb_queue_tail(&_xmitq, _skb); } tipc_bearer_bc_xmit(net, bearer_id, &_xmitq); } __skb_queue_purge(xmitq); __skb_queue_purge(&_xmitq); } static void tipc_bcast_select_xmit_method(struct net *net, int dests, struct tipc_mc_method *method) { struct tipc_bc_base *bb = tipc_bc_base(net); unsigned long exp = method->expires; /* Broadcast supported by used bearer/bearers? */ if (!bb->bcast_support) { method->rcast = true; return; } /* Any destinations which don't support replicast ? */ if (!bb->rcast_support) { method->rcast = false; return; } /* Can current method be changed ? */ method->expires = jiffies + TIPC_METHOD_EXPIRE; if (method->mandatory) return; if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL) && time_before(jiffies, exp)) return; /* Configuration as force 'broadcast' method */ if (bb->force_bcast) { method->rcast = false; return; } /* Configuration as force 'replicast' method */ if (bb->force_rcast) { method->rcast = true; return; } /* Configuration as 'autoselect' or default method */ /* Determine method to use now */ method->rcast = dests <= bb->bc_threshold; } /* tipc_bcast_xmit - broadcast the buffer chain to all external nodes * @net: the applicable net namespace * @pkts: chain of buffers containing message * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0 * Consumes the buffer chain. * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE */ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, u16 *cong_link_cnt) { struct tipc_link *l = tipc_bc_sndlink(net); struct sk_buff_head xmitq; int rc = 0; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); __skb_queue_purge(pkts); if (rc == -ELINKCONG) { *cong_link_cnt = 1; rc = 0; } return rc; } /* tipc_rcast_xmit - replicate and send a message to given destination nodes * @net: the applicable net namespace * @pkts: chain of buffers containing message * @dests: list of destination nodes * @cong_link_cnt: returns number of congested links * @cong_links: returns identities of congested links * Returns 0 if success, otherwise errno */ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_nlist *dests, u16 *cong_link_cnt) { struct tipc_dest *dst, *tmp; struct sk_buff_head _pkts; u32 dnode, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); __skb_queue_head_init(&_pkts); list_for_each_entry_safe(dst, tmp, &dests->list, list) { dnode = dst->node; if (!tipc_msg_pskb_copy(dnode, pkts, &_pkts)) return -ENOMEM; /* Any other return value than -ELINKCONG is ignored */ if (tipc_node_xmit(net, &_pkts, dnode, selector) == -ELINKCONG) (*cong_link_cnt)++; } return 0; } /* tipc_mcast_send_sync - deliver a dummy message with SYN bit * @net: the applicable net namespace * @skb: socket buffer to copy * @method: send method to be used * @dests: destination nodes for message. * Returns 0 if success, otherwise errno */ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, struct tipc_mc_method *method, struct tipc_nlist *dests) { struct tipc_msg *hdr, *_hdr; struct sk_buff_head tmpq; u16 cong_link_cnt = 0; struct sk_buff *_skb; int rc = 0; /* Is a cluster supporting with new capabilities ? */ if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL)) return 0; hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) hdr = msg_inner_hdr(hdr); if (msg_type(hdr) != TIPC_MCAST_MSG) return 0; /* Allocate dummy message */ _skb = tipc_buf_acquire(MCAST_H_SIZE, GFP_KERNEL); if (!_skb) return -ENOMEM; /* Preparing for 'synching' header */ msg_set_syn(hdr, 1); /* Copy skb's header into a dummy header */ skb_copy_to_linear_data(_skb, hdr, MCAST_H_SIZE); skb_orphan(_skb); /* Reverse method for dummy message */ _hdr = buf_msg(_skb); msg_set_size(_hdr, MCAST_H_SIZE); msg_set_is_rcast(_hdr, !msg_is_rcast(hdr)); msg_set_errcode(_hdr, TIPC_ERR_NO_PORT); __skb_queue_head_init(&tmpq); __skb_queue_tail(&tmpq, _skb); if (method->rcast) rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt); else rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt); /* This queue should normally be empty by now */ __skb_queue_purge(&tmpq); return rc; } /* tipc_mcast_xmit - deliver message to indicated destination nodes * and to identified node local sockets * @net: the applicable net namespace * @pkts: chain of buffers containing message * @method: send method to be used * @dests: destination nodes for message. * @cong_link_cnt: returns number of encountered congested destination links * Consumes buffer chain. * Returns 0 if success, otherwise errno */ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt) { struct sk_buff_head inputq, localq; bool rcast = method->rcast; struct tipc_msg *hdr; struct sk_buff *skb; int rc = 0; skb_queue_head_init(&inputq); __skb_queue_head_init(&localq); /* Clone packets before they are consumed by next call */ if (dests->local && !tipc_msg_reassemble(pkts, &localq)) { rc = -ENOMEM; goto exit; } /* Send according to determined transmit method */ if (dests->remote) { tipc_bcast_select_xmit_method(net, dests->remote, method); skb = skb_peek(pkts); hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) hdr = msg_inner_hdr(hdr); msg_set_is_rcast(hdr, method->rcast); /* Switch method ? */ if (rcast != method->rcast) { rc = tipc_mcast_send_sync(net, skb, method, dests); if (unlikely(rc)) { pr_err("Unable to send SYN: method %d, rc %d\n", rcast, rc); goto exit; } } if (method->rcast) rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); else rc = tipc_bcast_xmit(net, pkts, cong_link_cnt); } if (dests->local) { tipc_loopback_trace(net, &localq); tipc_sk_mcast_rcv(net, &localq, &inputq); } exit: /* This queue should normally be empty by now */ __skb_queue_purge(pkts); return rc; } /* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link * * RCU is locked, no other locks set */ int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb) { struct tipc_msg *hdr = buf_msg(skb); struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; struct sk_buff_head xmitq; int rc; __skb_queue_head_init(&xmitq); if (msg_mc_netid(hdr) != tipc_netid(net) || !tipc_link_is_up(l)) { kfree_skb(skb); return 0; } tipc_bcast_lock(net); if (msg_user(hdr) == BCAST_PROTOCOL) rc = tipc_link_bc_nack_rcv(l, skb, &xmitq); else rc = tipc_link_rcv(l, skb, NULL); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); return rc; } /* tipc_bcast_ack_rcv - receive and handle a broadcast acknowledge * * RCU is locked, no other locks set */ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; u16 acked = msg_bcast_ack(hdr); struct sk_buff_head xmitq; /* Ignore bc acks sent by peer before bcast synch point was received */ if (msg_bc_ack_invalid(hdr)) return; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); tipc_link_bc_ack_rcv(l, acked, 0, NULL, &xmitq, NULL); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); } /* tipc_bcast_synch_rcv - check and update rcv link with peer's send state * * RCU is locked, no other locks set */ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *retrq) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; struct tipc_gap_ack_blks *ga; struct sk_buff_head xmitq; int rc = 0; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (msg_type(hdr) != STATE_MSG) { tipc_link_bc_init_rcv(l, hdr); } else if (!msg_bc_ack_invalid(hdr)) { tipc_get_gap_ack_blks(&ga, l, hdr, false); if (!sysctl_tipc_bc_retruni) retrq = &xmitq; rc = tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), msg_bc_gap(hdr), ga, &xmitq, retrq); rc |= tipc_link_bc_sync_rcv(l, hdr, &xmitq); } tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); return rc; } /* tipc_bcast_add_peer - add a peer node to broadcast link and bearer * * RCU is locked, node lock is set */ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l, struct sk_buff_head *xmitq) { struct tipc_link *snd_l = tipc_bc_sndlink(net); tipc_bcast_lock(net); tipc_link_add_bc_peer(snd_l, uc_l, xmitq); tipc_bcbase_select_primary(net); tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); } /* tipc_bcast_remove_peer - remove a peer node from broadcast link and bearer * * RCU is locked, node lock is set */ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) { struct tipc_link *snd_l = tipc_bc_sndlink(net); struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); tipc_bcbase_select_primary(net); tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); } int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l) { if (!l) return -ENOPROTOOPT; tipc_bcast_lock(net); tipc_link_reset_stats(l); tipc_bcast_unlock(net); return 0; } static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; if (max_win < BCLINK_WIN_MIN) max_win = BCLINK_WIN_MIN; if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); tipc_link_set_queue_limits(l, tipc_link_min_win(l), max_win); tipc_bcast_unlock(net); return 0; } static int tipc_bc_link_set_broadcast_mode(struct net *net, u32 bc_mode) { struct tipc_bc_base *bb = tipc_bc_base(net); switch (bc_mode) { case BCLINK_MODE_BCAST: if (!bb->bcast_support) return -ENOPROTOOPT; bb->force_bcast = true; bb->force_rcast = false; break; case BCLINK_MODE_RCAST: if (!bb->rcast_support) return -ENOPROTOOPT; bb->force_bcast = false; bb->force_rcast = true; break; case BCLINK_MODE_SEL: if (!bb->bcast_support || !bb->rcast_support) return -ENOPROTOOPT; bb->force_bcast = false; bb->force_rcast = false; break; default: return -EINVAL; } return 0; } static int tipc_bc_link_set_broadcast_ratio(struct net *net, u32 bc_ratio) { struct tipc_bc_base *bb = tipc_bc_base(net); if (!bb->bcast_support || !bb->rcast_support) return -ENOPROTOOPT; if (bc_ratio > 100 || bc_ratio <= 0) return -EINVAL; bb->rc_ratio = bc_ratio; tipc_bcast_lock(net); tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); return 0; } int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]) { int err; u32 win; u32 bc_mode; u32 bc_ratio; struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; if (!attrs[TIPC_NLA_LINK_PROP]) return -EINVAL; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) return err; if (!props[TIPC_NLA_PROP_WIN] && !props[TIPC_NLA_PROP_BROADCAST] && !props[TIPC_NLA_PROP_BROADCAST_RATIO]) { return -EOPNOTSUPP; } if (props[TIPC_NLA_PROP_BROADCAST]) { bc_mode = nla_get_u32(props[TIPC_NLA_PROP_BROADCAST]); err = tipc_bc_link_set_broadcast_mode(net, bc_mode); } if (!err && props[TIPC_NLA_PROP_BROADCAST_RATIO]) { bc_ratio = nla_get_u32(props[TIPC_NLA_PROP_BROADCAST_RATIO]); err = tipc_bc_link_set_broadcast_ratio(net, bc_ratio); } if (!err && props[TIPC_NLA_PROP_WIN]) { win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); err = tipc_bc_link_set_queue_limits(net, win); } return err; } int tipc_bcast_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_bc_base *bb = NULL; struct tipc_link *l = NULL; bb = kzalloc(sizeof(*bb), GFP_KERNEL); if (!bb) goto enomem; tn->bcbase = bb; spin_lock_init(&tipc_net(net)->bclock); if (!tipc_link_bc_create(net, 0, 0, NULL, one_page_mtu, BCLINK_WIN_DEFAULT, BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, NULL, &l)) goto enomem; bb->link = l; tn->bcl = l; bb->rc_ratio = 10; bb->rcast_support = true; return 0; enomem: kfree(bb); kfree(l); return -ENOMEM; } void tipc_bcast_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); synchronize_net(); kfree(tn->bcbase); kfree(tn->bcl); } void tipc_nlist_init(struct tipc_nlist *nl, u32 self) { memset(nl, 0, sizeof(*nl)); INIT_LIST_HEAD(&nl->list); nl->self = self; } void tipc_nlist_add(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = true; else if (tipc_dest_push(&nl->list, node, 0)) nl->remote++; } void tipc_nlist_del(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = false; else if (tipc_dest_del(&nl->list, node, 0)) nl->remote--; } void tipc_nlist_purge(struct tipc_nlist *nl) { tipc_dest_list_purge(&nl->list); nl->remote = 0; nl->local = false; } u32 tipc_bcast_get_mode(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); if (bb->force_bcast) return BCLINK_MODE_BCAST; if (bb->force_rcast) return BCLINK_MODE_RCAST; if (bb->bcast_support && bb->rcast_support) return BCLINK_MODE_SEL; return 0; } u32 tipc_bcast_get_broadcast_ratio(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); return bb->rc_ratio; } void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, struct sk_buff_head *inputq) { struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr, *_hdr; bool match = false; u32 node, port; skb = skb_peek(inputq); if (!skb) return; hdr = buf_msg(skb); if (likely(!msg_is_syn(hdr) && skb_queue_empty(defq))) return; node = msg_orignode(hdr); if (node == tipc_own_addr(net)) return; port = msg_origport(hdr); /* Has the twin SYN message already arrived ? */ skb_queue_walk(defq, _skb) { _hdr = buf_msg(_skb); if (msg_orignode(_hdr) != node) continue; if (msg_origport(_hdr) != port) continue; match = true; break; } if (!match) { if (!msg_is_syn(hdr)) return; __skb_dequeue(inputq); __skb_queue_tail(defq, skb); return; } /* Deliver non-SYN message from other link, otherwise queue it */ if (!msg_is_syn(hdr)) { if (msg_is_rcast(hdr) != msg_is_rcast(_hdr)) return; __skb_dequeue(inputq); __skb_queue_tail(defq, skb); return; } /* Queue non-SYN/SYN message from same link */ if (msg_is_rcast(hdr) == msg_is_rcast(_hdr)) { __skb_dequeue(inputq); __skb_queue_tail(defq, skb); return; } /* Matching SYN messages => return the one with data, if any */ __skb_unlink(_skb, defq); if (msg_data_sz(hdr)) { kfree_skb(_skb); } else { __skb_dequeue(inputq); kfree_skb(skb); __skb_queue_tail(inputq, _skb); } /* Deliver subsequent non-SYN messages from same peer */ skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (msg_orignode(_hdr) != node) continue; if (msg_origport(_hdr) != port) continue; if (msg_is_syn(_hdr)) break; __skb_unlink(_skb, defq); __skb_queue_tail(inputq, _skb); } }
1 10 25 2 23 2 20 11 11 1 10 8 18 19 17 6 6 3 3 3 3 4 1 1 1 1 136 15 14 3 7 1 5 8 6 9 7 9 9 9 4 9 9 9 28 125 9 9 9 5 7 9 9 9 8 2 2 2 3 3 3 7 7 13 10 8 8 2 4 38 34 4 15 15 15 2 23 23 10 2 5 38 7 6 3 3 6 6 3 3 3 2 3 3 2 2 4 133 133 89 89 3 1 1 1 5 4 3 23 4 4 2 10 4 6 1 1 1 3 3 3 2 4 11 11 2 14 1 7 5 2 6 9 6 4 20 20 42 42 46 46 32 32 4 5 2741 2740 1346 345 566 566 32 44 44 44 1 566 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ /* Devmaps primary use is as a backend map for XDP BPF helper call * bpf_redirect_map(). Because XDP is mostly concerned with performance we * spent some effort to ensure the datapath with redirect maps does not use * any locking. This is a quick note on the details. * * We have three possible paths to get into the devmap control plane bpf * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall * will invoke an update, delete, or lookup operation. To ensure updates and * deletes appear atomic from the datapath side xchg() is used to modify the * netdev_map array. Then because the datapath does a lookup into the netdev_map * array (read-only) from an RCU critical section we use call_rcu() to wait for * an rcu grace period before free'ing the old data structures. This ensures the * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. * * Finally, any of the above may race with a netdev_unregister notifier. The * unregister notifier must search for net devices in the map structure that * contain a reference to the net device and remove them. This is a two step * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. * When removing the dev a cmpxchg() is used to ensure the correct dev is * removed, in the case of a concurrent update or delete operation it is * possible that the initially referenced dev is no longer in the map. As the * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. * * The devmap_hash type is a map type which interprets keys as ifindexes and * indexes these using a hashmap. This allows maps that use ifindex as key to be * densely packed instead of having holes in the lookup array for unused * ifindexes. The setup and packet enqueue/send code is shared between the two * types of devmap; only the lookup and insertion is different. */ #include <linux/bpf.h> #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; spinlock_t index_lock; unsigned int items; u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct hlist_head *dev_map_create_hash(unsigned int entries, int numa_node) { int i; struct hlist_head *hash; hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, int idx) { return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; } static int dev_map_alloc_check(union bpf_attr *attr) { u32 valsize = attr->value_size; /* check sanity of attributes. 2 value sizes supported: * 4 bytes: ifindex * 8 bytes: ifindex + prog fd */ if (attr->max_entries == 0 || attr->key_size != 4 || (valsize != offsetofend(struct bpf_devmap_val, ifindex) && valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2; roundup_pow_of_two() * can overflow into UB on 32-bit arches */ if (attr->max_entries > 1UL << 31) return -EINVAL; } return 0; } static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2 */ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) return -ENOMEM; } return 0; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; int err; dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { bpf_map_area_free(dtab); return ERR_PTR(err); } spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; } static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were * disconnected from events. The following synchronize_rcu() guarantees * both rcu read critical sections complete and waits for * preempt-disable regions (NAPI being the relevant context here) so we * are certain there will be no further reads against the netdev_map and * all flush operations are complete. Flush operations can only be done * from NAPI context for this reason. */ spin_lock(&dev_map_lock); list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() * during NAPI callback and cleared after the XDP redirect. There is no * explicit RCU read section which protects bpf_redirect_info->map but * local_bh_disable() also marks the beginning an RCU section. This * makes the complete softirq callback RCU protected. Thus after * following synchronize_rcu() there no bpf_redirect_info->map == map * assignment. */ synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } bpf_map_area_free(dtab->netdev_map); } bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= dtab->map.max_entries) { *next = 0; return 0; } if (index == dtab->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; hlist_for_each_entry_rcu(dev, head, index_hlist, lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; return NULL; } static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 idx, *next = next_key; struct bpf_dtab_netdev *dev, *next_dev; struct hlist_head *head; int i = 0; if (!key) goto find_first; idx = *(u32 *)key; dev = __dev_map_hash_lookup_elem(map, idx); if (!dev) goto find_first; next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } i = idx & (dtab->n_buckets - 1); i++; find_first: for (; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } } return -ENOENT; } static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *tx_dev, struct net_device *rx_dev) { struct xdp_txq_info txq = { .dev = tx_dev }; struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (unlikely(err < 0)) xdp_return_frame_rx_napi(xdpf); else frames[nframes++] = xdpf; break; default: bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); break; } } return nframes; /* sent frames count */ } static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; unsigned int cnt = bq->count; int sent = 0, err = 0; int to_send = cnt; int i; if (unlikely(!cnt)) return; for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } if (bq->xdp_prog) { to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { /* If ndo_xdp_xmit fails with an errno, no frames have * been xmit'ed. */ err = sent; sent = 0; } /* If not all frames have been transmitted, it is our * responsibility to free them */ for (i = sent; unlikely(i < to_send); i++) xdp_return_frame_rx_napi(bq->q[i]); out: bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ void __dev_flush(struct list_head *flush_list) { struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); } } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; obj = rcu_dereference_check(dtab->netdev_map[key], rcu_read_lock_bh_held()); return obj; } /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above * xdp_do_flush() in filter.c. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. * * Do the same with xdp_prog and flush_list since these fields * are only ever modified together. */ if (!bq->dev_rx) { struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); } bq->q[bq->count++] = xdpf; } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { int err; if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return -EOPNOTSUPP; if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return -EOPNOTSUPP; err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) { struct xdp_txq_info txq = { .dev = dst->dev }; struct xdp_buff xdp; u32 act; if (!dst->xdp_prog) return XDP_PASS; __skb_pull(skb, skb->mac_len); xdp.txq = &txq; act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); switch (act) { case XDP_PASS: __skb_push(skb, skb->mac_len); break; default: bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); fallthrough; case XDP_DROP: kfree_skb(skb); break; } return act; } int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj) return false; if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return false; if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return false; if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; } static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, struct net_device *dev_rx, struct xdp_frame *xdpf) { struct xdp_frame *nxdpf; nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); return 0; } static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) { while (num_excluded--) { if (ifindex == excluded[num_excluded]) return true; } return false; } /* Get ifindex of each upper device. 'indexes' must be able to hold at * least MAX_NEST_DEV elements. * Returns the number of ifindexes added. */ static int get_upper_ifindexes(struct net_device *dev, int *indexes) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { indexes[n++] = upper->ifindex; } return n; } int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); excluded_devices[num_excluded++] = dev_rx->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } } /* consume the last copy of the frame */ if (last_dst) bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); else xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { int err; err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. */ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) return 0; skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); return 0; } static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; err = dev_map_generic_redirect(dst, nskb, xdp_prog); if (unlikely(err)) { consume_skb(nskb); return err; } return 0; } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct hlist_node *next; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev, excluded_devices); excluded_devices[num_excluded++] = dev->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!dst) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } } /* consume the first skb and return */ if (last_dst) return dev_map_generic_redirect(last_dst, skb, xdp_prog); /* dtab is empty */ consume_skb(skb); return 0; } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) { call_rcu(&old_dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } return 0; } static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, k); if (old_dev) { dtab->items--; hlist_del_init_rcu(&old_dev->index_hlist); call_rcu(&old_dev->rcu, __dev_map_entry_free); ret = 0; } spin_unlock_irqrestore(&dtab->index_lock, flags); return ret; } static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, struct bpf_devmap_val *val, unsigned int idx) { struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), GFP_NOWAIT, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); dev->dev = dev_get_by_index(net, val->ifindex); if (!dev->dev) goto err_out; if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; if (prog->expected_attach_type != BPF_XDP_DEVMAP || !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } dev->idx = idx; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; } else { dev->xdp_prog = NULL; dev->val.bpf_prog.id = 0; } dev->val.ifindex = val->ifindex; return dev; err_put_prog: bpf_prog_put(prog); err_put_dev: dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); } static long __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= dtab->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed * Remembering the driver side flush operation will happen before the * net device is removed. */ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); else atomic_inc((atomic_t *)&dtab->items); return 0; } static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); } else { if (dtab->items >= dtab->map.max_entries) { spin_unlock_irqrestore(&dtab->index_lock, flags); call_rcu(&dev->rcu, __dev_map_entry_free); return -E2BIG; } dtab->items++; } hlist_add_head_rcu(&dev->index_hlist, dev_map_index_hash(dtab, idx)); spin_unlock_irqrestore(&dtab->index_lock, flags); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; out_err: spin_unlock_irqrestore(&dtab->index_lock, flags); return err; } static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_hash_lookup_elem); } static u64 dev_map_mem_usage(const struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u64 usage = sizeof(struct bpf_dtab); if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); else usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); usage += atomic_read((atomic_t *)&dtab->items) * (u64)sizeof(struct bpf_dtab_netdev); return usage; } BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, struct net_device *netdev) { unsigned long flags; u32 i; spin_lock_irqsave(&dtab->index_lock, flags); for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { if (netdev != dev->dev) continue; dtab->items--; hlist_del_rcu(&dev->index_hlist); call_rcu(&dev->rcu, __dev_map_entry_free); } } spin_unlock_irqrestore(&dtab->index_lock, flags); } static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; int i, cpu; switch (event) { case NETDEV_REGISTER: if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) break; /* will be freed in free_netdev() */ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; for_each_possible_cpu(cpu) per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete * operation does not free a netdev_map entry while we * are comparing it against the netdev being unregistered. */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dev_map_hash_remove_netdev(dtab, netdev); continue; } for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) { call_rcu(&dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } } } rcu_read_unlock(); break; default: break; } return NOTIFY_OK; } static struct notifier_block dev_map_notifier = { .notifier_call = dev_map_notification, }; static int __init dev_map_init(void) { /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } subsys_initcall(dev_map_init);
1 1 3 2 1 1 3 1 2 63 63 63 59 59 16 16 3 3 4 4 2 4 2 2 3 1 1 1 38 1 8 36 3 1 1 1 9 9 3 3 3 3 4 3 4 1 2 5 3 1 1 1 6 1 2 1 2 4 2 1 1 7 7 2 3 2 2 1 2 4 1 1 2 1 3 1 1 1 1 12 4 4 4 4 6 4 4 1 2 3 3 2 3 2 5 2 1 2 7 2 1 4 6 2 1 4 5 2 1 2 2 2 2 2 2 2 1 1 7 1 2 1 1 1 1 5 1 2 1 1 445 105 341 341 59 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * * Vendor commands implementation based on net/wireless/nl80211.c * which is: * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/genetlink.h> #include <linux/nfc.h> #include <linux/slab.h> #include "nfc.h" #include "llcp.h" static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { [NFC_SDP_ATTR_URI] = { .type = NLA_STRING, .len = U8_MAX - 4 }, [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfc_genl_family, flags, NFC_CMD_GET_TARGET); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res) || nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res)) goto nla_put_failure; if (target->nfcid1_len > 0 && nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len, target->nfcid1)) goto nla_put_failure; if (target->sensb_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSB_RES, target->sensb_res_len, target->sensb_res)) goto nla_put_failure; if (target->sensf_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSF_RES, target->sensf_res_len, target->sensf_res)) goto nla_put_failure; if (target->is_iso15693) { if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID, target->iso15693_dsfid) || nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID, sizeof(target->iso15693_uid), target->iso15693_uid)) goto nla_put_failure; } if (target->ats_len > 0 && nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len, target->ats)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; u32 idx; if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); idx = nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return ERR_PTR(-ENODEV); return dev; } static int nfc_genl_dump_targets(struct sk_buff *skb, struct netlink_callback *cb) { int i = cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; int rc; if (!dev) { dev = __get_device_from_cb(cb); if (IS_ERR(dev)) return PTR_ERR(dev); cb->args[1] = (long) dev; } device_lock(&dev->dev); cb->seq = dev->targets_generation; while (i < dev->n_targets) { rc = nfc_genl_send_target(skb, &dev->targets[i], cb, NLM_F_MULTI); if (rc < 0) break; i++; } device_unlock(&dev->dev); cb->args[0] = i; return skb->len; } static int nfc_genl_dump_targets_done(struct netlink_callback *cb) { struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; if (dev) nfc_put_device(dev); return 0; } int nfc_genl_targets_found(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; dev->genl_data.poll_req_portid = 0; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGETS_FOUND); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGET_LOST); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_ACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(msg, NFC_ATTR_TM_PROTOCOLS, protocol)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_deactivated(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_DEACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg) { if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) return -1; return 0; } int nfc_genl_device_added(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_ADDED); if (!hdr) goto free_msg; if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_device_removed(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) { struct sk_buff *msg; struct nlattr *sdp_attr, *uri_attr; struct nfc_llcp_sdp_tlv *sdres; struct hlist_node *n; void *hdr; int rc = -EMSGSIZE; int i; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_LLC_SDRES); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP); if (sdp_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } i = 1; hlist_for_each_entry_safe(sdres, n, sdres_list, node) { pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); uri_attr = nla_nest_start_noflag(msg, i++); if (uri_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap)) goto nla_put_failure; if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri)) goto nla_put_failure; nla_nest_end(msg, uri_attr); hlist_del(&sdres->node); nfc_llcp_free_sdp_tlv(sdres); } nla_nest_end(msg, sdp_attr); genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); nfc_llcp_free_sdp_tlv_list(sdres_list); return rc; } int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_ADDED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_TRANSACTION); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) || nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len, evt_transaction->aid) || nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len, evt_transaction->params)) goto nla_put_failure; /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { const struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_CONNECTIVITY); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_DEVICE); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_devices_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is up\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_UP); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (rf_mode == NFC_RF_INITIATOR && nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; if (nla_put_u8(msg, NFC_ATTR_COMM_MODE, comm_mode) || nla_put_u8(msg, NFC_ATTR_RF_MODE, rf_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); dev->dep_link_up = true; genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_dep_link_down_event(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is down\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_DOWN); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct nfc_dev *dev; u32 idx; int rc = -ENOBUFS; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto out_putdev; } rc = nfc_genl_send_device(msg, dev, info->snd_portid, info->snd_seq, NULL, 0); if (rc < 0) goto out_free; nfc_put_device(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_putdev: nfc_put_device(dev); return rc; } static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_up(dev); nfc_put_device(dev); return rc; } static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; u32 im_protocols = 0, tm_protocols = 0; pr_debug("Poll start\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] && !info->attrs[NFC_ATTR_PROTOCOLS]) && !info->attrs[NFC_ATTR_TM_PROTOCOLS])) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (info->attrs[NFC_ATTR_TM_PROTOCOLS]) tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]); if (info->attrs[NFC_ATTR_IM_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]); else if (info->attrs[NFC_ATTR_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; mutex_lock(&dev->genl_data.genl_data_mutex); rc = nfc_start_poll(dev, im_protocols, tm_protocols); if (!rc) dev->genl_data.poll_req_portid = info->snd_portid; mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (!dev->polling) { device_unlock(&dev->dev); nfc_put_device(dev); return -EINVAL; } device_unlock(&dev->dev); mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid != info->snd_portid) { rc = -EBUSY; goto out; } rc = nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; out: mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx, protocol; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX] || !info->attrs[NFC_ATTR_PROTOCOLS]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); return rc; } static int nfc_genl_deactivate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc, tgt_idx; u32 idx; u8 comm; pr_debug("DEP link up\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_COMM_MODE]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (!info->attrs[NFC_ATTR_TARGET_INDEX]) tgt_idx = NFC_TARGET_IDX_ANY; else tgt_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]); if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE) return -EINVAL; dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_up(dev, tgt_idx, comm); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_send_params(struct sk_buff *msg, struct nfc_llcp_local *local, u32 portid, u32 seq) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0, NFC_CMD_LLC_GET_PARAMS); if (!hdr) return -EMSGSIZE; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) || nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; int rc = 0; struct sk_buff *msg = NULL; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto put_local; } rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); if (rc < 0) { if (msg) nlmsg_free(msg); return rc; } return genlmsg_reply(msg, info); } static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; u8 rw = 0; u16 miux = 0; u32 idx; int rc = 0; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] && !info->attrs[NFC_ATTR_LLC_PARAM_RW] && !info->attrs[NFC_ATTR_LLC_PARAM_MIUX])) return -EINVAL; if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) { rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]); if (rw > LLCP_MAX_RW) return -EINVAL; } if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) { miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]); if (miux > LLCP_MAX_MIUX) return -EINVAL; } idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) { if (dev->dep_link_up) { rc = -EINPROGRESS; goto put_local; } local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]); } if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) local->rw = rw; if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) local->miux = cpu_to_be16(miux); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1]; u32 idx; u8 tid; char *uri; int rc = 0, rem; size_t uri_len, tlvs_len; struct hlist_head sdreq_list; struct nfc_llcp_sdp_tlv *sdreq; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_LLC_SDP]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (dev->dep_link_up == false) { rc = -ENOLINK; goto exit; } local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } INIT_HLIST_HEAD(&sdreq_list); tlvs_len = 0; nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX, attr, nfc_sdp_genl_policy, info->extack); if (rc != 0) { rc = -EINVAL; goto put_local; } if (!sdp_attrs[NFC_SDP_ATTR_URI]) continue; uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri_len == 0) continue; uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); if (*uri == 0) continue; tid = local->sdreq_next_tid++; sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); if (sdreq == NULL) { rc = -ENOMEM; goto put_local; } tlvs_len += sdreq->tlv_len; hlist_add_head(&sdreq->node, &sdreq_list); } if (hlist_empty(&sdreq_list)) { rc = -EINVAL; goto put_local; } rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); nfc_put_device(dev); return rc; } int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_FW_DOWNLOAD); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_enable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_disable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct nfc_se *se, *n; list_for_each_entry_safe(se, n, &dev->secure_elements, list) { hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_SE); if (!hdr) goto nla_put_failure; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); } return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_ses(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_ses_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (!device_is_registered(&dev->dev)) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->se_io) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state != NFC_SE_ENABLED) { rc = -ENODEV; goto error; } rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); device_unlock(&dev->dev); return rc; error: device_unlock(&dev->dev); kfree(cb_context); return rc; } struct se_io_ctx { u32 dev_idx; u32 se_idx; }; static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err) { struct se_io_ctx *ctx = context; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { kfree(ctx); return; } hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_SE_IO); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, ctx->se_idx) || nla_put(msg, NFC_ATTR_SE_APDU, apdu_len, apdu)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); kfree(ctx); return; nla_put_failure: free_msg: nlmsg_free(msg); kfree(ctx); return; } static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct se_io_ctx *ctx; u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || !info->attrs[NFC_ATTR_SE_APDU]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->ops || !dev->ops->se_io) { rc = -EOPNOTSUPP; goto put_dev; } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); if (apdu_len == 0) { rc = -EINVAL; goto put_dev; } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto put_dev; } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); put_dev: nfc_put_device(dev); return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; int i, err; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_VENDOR_ID] || !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->vendor_cmds || !dev->n_vendor_cmds) { err = -ENODEV; goto put_dev; } if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); if (data_len == 0) { err = -EINVAL; goto put_dev; } } else { data = NULL; data_len = 0; } for (i = 0; i < dev->n_vendor_cmds; i++) { cmd = &dev->vendor_cmds[i]; if (cmd->vendor_id != vid || cmd->subcmd != subcmd) continue; dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; goto put_dev; } err = -EOPNOTSUPP; put_dev: nfc_put_device(dev); return err; } /* message building helper */ static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); } static struct sk_buff * __nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, u32 portid, u32 seq, enum nfc_attrs attr, u32 oui, u32 subcmd, gfp_t gfp) { struct sk_buff *skb; void *hdr; skb = nlmsg_new(approxlen + 100, gfp); if (!skb) return NULL; hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); if (!hdr) { kfree_skb(skb); return NULL; } if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) goto nla_put_failure; ((void **)skb->cb)[0] = dev; ((void **)skb->cb)[1] = hdr; return skb; nla_put_failure: kfree_skb(skb); return NULL; } struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, enum nfc_attrs attr, u32 oui, u32 subcmd, int approxlen) { if (WARN_ON(!dev->cur_cmd_info)) return NULL; return __nfc_alloc_vendor_cmd_skb(dev, approxlen, dev->cur_cmd_info->snd_portid, dev->cur_cmd_info->snd_seq, attr, oui, subcmd, GFP_KERNEL); } EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); int nfc_vendor_cmd_reply(struct sk_buff *skb) { struct nfc_dev *dev = ((void **)skb->cb)[0]; void *hdr = ((void **)skb->cb)[1]; /* clear CB data for netlink core to own from now on */ memset(skb->cb, 0, sizeof(skb->cb)); if (WARN_ON(!dev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; } genlmsg_end(skb, hdr); return genlmsg_reply(skb, dev->cur_cmd_info); } EXPORT_SYMBOL(nfc_vendor_cmd_reply); static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_get_device, .dumpit = nfc_genl_dump_devices, .done = nfc_genl_dump_devices_done, }, { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, { .cmd = NFC_CMD_LLC_GET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_get_params, }, { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_ses, .done = nfc_genl_dump_ses_done, }, { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, .maxattr = NFC_ATTR_MAX, .policy = nfc_genl_policy, .module = THIS_MODULE, .ops = nfc_genl_ops, .n_ops = ARRAY_SIZE(nfc_genl_ops), .resv_start_op = NFC_CMD_DEACTIVATE_TARGET + 1, .mcgrps = nfc_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), }; struct urelease_work { struct work_struct w; u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) { struct urelease_work *w = container_of(work, struct urelease_work, w); struct class_dev_iter iter; struct nfc_dev *dev; pr_debug("portid %d\n", w->portid); mutex_lock(&nfc_devlist_mutex); nfc_device_iter_init(&iter); dev = nfc_device_iter_next(&iter); while (dev) { mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid == w->portid) { nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; } mutex_unlock(&dev->genl_data.genl_data_mutex); dev = nfc_device_iter_next(&iter); } nfc_device_iter_exit(&iter); mutex_unlock(&nfc_devlist_mutex); kfree(w); } static int nfc_genl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct urelease_work *w; if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC) goto out; pr_debug("NETLINK_URELEASE event from id %d\n", n->portid); w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w) { INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; schedule_work(&w->w); } out: return NOTIFY_DONE; } void nfc_genl_data_init(struct nfc_genl_data *genl_data) { genl_data->poll_req_portid = 0; mutex_init(&genl_data->genl_data_mutex); } void nfc_genl_data_exit(struct nfc_genl_data *genl_data) { mutex_destroy(&genl_data->genl_data_mutex); } static struct notifier_block nl_notifier = { .notifier_call = nfc_genl_rcv_nl_event, }; /** * nfc_genl_init() - Initialize netlink interface * * This initialization function registers the nfc netlink family. */ int __init nfc_genl_init(void) { int rc; rc = genl_register_family(&nfc_genl_family); if (rc) return rc; netlink_register_notifier(&nl_notifier); return 0; } /** * nfc_genl_exit() - Deinitialize netlink interface * * This exit function unregisters the nfc netlink family. */ void nfc_genl_exit(void) { netlink_unregister_notifier(&nl_notifier); genl_unregister_family(&nfc_genl_family); }
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 #ifndef _CRYPTO_GCM_H #define _CRYPTO_GCM_H #include <linux/errno.h> #include <crypto/aes.h> #include <crypto/gf128mul.h> #define GCM_AES_IV_SIZE 12 #define GCM_RFC4106_IV_SIZE 8 #define GCM_RFC4543_IV_SIZE 8 /* * validate authentication tag for GCM */ static inline int crypto_gcm_check_authsize(unsigned int authsize) { switch (authsize) { case 4: case 8: case 12: case 13: case 14: case 15: case 16: break; default: return -EINVAL; } return 0; } /* * validate authentication tag for RFC4106 */ static inline int crypto_rfc4106_check_authsize(unsigned int authsize) { switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return 0; } /* * validate assoclen for RFC4106/RFC4543 */ static inline int crypto_ipsec_check_assoclen(unsigned int assoclen) { switch (assoclen) { case 16: case 20: break; default: return -EINVAL; } return 0; } struct aesgcm_ctx { be128 ghash_key; struct crypto_aes_ctx aes_ctx; unsigned int authsize; }; int aesgcm_expandkey(struct aesgcm_ctx *ctx, const u8 *key, unsigned int keysize, unsigned int authsize); void aesgcm_encrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, int crypt_len, const u8 *assoc, int assoc_len, const u8 iv[GCM_AES_IV_SIZE], u8 *authtag); bool __must_check aesgcm_decrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, int crypt_len, const u8 *assoc, int assoc_len, const u8 iv[GCM_AES_IV_SIZE], const u8 *authtag); #endif
2 2 2 185 183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle incoming frames * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE #include <net/netfilter/nf_queue.h> #endif #include <linux/neighbour.h> #include <net/arp.h> #include <net/dsa.h> #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" #include "br_private_tunnel.h" static int br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { br_drop_fake_rtable(skb); return netif_receive_skb(skb); } static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); struct net_bridge_vlan_group *vg; dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); /* Reset the offload_fwd_mark because there could be a stacked * bridge above, and it should not think this bridge it doing * that bridge's work forwarding out its ports. */ br_switchdev_frame_unmark(skb); /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. */ if (!(brdev->flags & IFF_PROMISC) && !br_allowed_egress(vg, skb)) { kfree_skb(skb); return NET_RX_DROP; } indev = skb->dev; skb->dev = brdev; skb = br_handle_vlan(br, NULL, vg, skb); if (!skb) return NET_RX_DROP; /* update the multicast stats if the packet is IGMP/MLD */ br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); BR_INPUT_SKB_CB(skb)->promisc = promisc; return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); } /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; bool promisc; u16 vid = 0; u8 state; if (!p) goto drop; br = p->br; if (br_mst_is_enabled(br)) { state = BR_STATE_FORWARDING; } else { if (p->state == BR_STATE_DISABLED) { reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; } state = p->state; } brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, &state, &vlan)) goto out; if (p->flags & BR_PORT_LOCKED) { struct net_bridge_fdb_entry *fdb_src = br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); if (!fdb_src) { /* FDB miss. Create locked FDB entry if MAB is enabled * and drop the packet. */ if (p->flags & BR_PORT_MAB) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } else if (READ_ONCE(fdb_src->dst) != p || test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { /* FDB mismatch. Drop the packet without roaming. */ goto drop; } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) { /* FDB match, but entry is locked. Refresh it and drop * the packet. */ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } } nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); promisc = !!(br->dev->flags & IFF_PROMISC); local_rcv = promisc; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { pkt_type = BR_PKT_BROADCAST; local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } if (state == BR_STATE_LEARNING) { reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; } BR_INPUT_SKB_CB(skb)->brdev = br->dev; BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); if (IS_ENABLED(CONFIG_INET) && (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *msg, _msg; msg = br_is_nd_neigh_msg(skb, &_msg); if (msg) br_do_suppress_nd(skb, br, vid, p, msg); } switch (pkt_type) { case BR_PKT_MULTICAST: mdst = br_mdb_entry_skb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(brmctx, skb) || br->dev->flags & IFF_ALLMULTI) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } mcast_hit = true; } else { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); if (unlikely(!dst && vid && br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) { dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0); if (dst && (!test_bit(BR_FDB_LOCAL, &dst->flags) || test_bit(BR_FDB_ADDED_BY_USER, &dst->flags))) dst = NULL; } break; default: break; } if (dst) { unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false, vid); else br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) return br_pass_frame_up(skb, promisc); out: return 0; drop: kfree_skb_reason(skb, reason); goto out; } EXPORT_SYMBOL_GPL(br_handle_frame_finish); static void __br_handle_local_finish(struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { __br_handle_local_finish(skb); /* return 1 to signal the okfn() was called so it's ok to use the skb */ return 1; } static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries *e = NULL; struct nf_hook_state state; unsigned int verdict, i; struct net *net; int ret; net = dev_net(skb->dev); #ifdef HAVE_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) goto frame_finish; #endif e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; nf_hook_state_init(&state, NF_BR_PRE_ROUTING, NFPROTO_BRIDGE, skb->dev, NULL, NULL, net, br_handle_frame_finish); for (i = 0; i < e->num_hook_entries; i++) { verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) { *pskb = skb; return RX_HANDLER_PASS; } break; case NF_DROP: kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; default: /* STOLEN */ return RX_HANDLER_CONSUMED; } } frame_finish: net = dev_net(skb->dev); br_handle_frame_finish(net, NULL, skb); #else br_handle_frame_finish(dev_net(skb->dev), NULL, skb); #endif return RX_HANDLER_CONSUMED; } /* Return 0 if the frame was not processed otherwise 1 * note: already called with rcu_read_lock */ static int br_process_frame_type(struct net_bridge_port *p, struct sk_buff *skb) { struct br_frame_type *tmp; hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list) if (unlikely(tmp->type == skb->protocol)) return tmp->frame_handler(p, skb); return 0; } /* * Return NULL if skb is handled * note: already called with rcu_read_lock */ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) { reason = SKB_DROP_REASON_MAC_INVALID_SOURCE; goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); br_tc_skb_miss_set(skb, false); p = br_port_get_rcu(skb->dev); if (p->flags & BR_VLAN_TUNNEL) br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p)); if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; /* * See IEEE 802.1D Table 7-10 Reserved addresses * * Assignment Value * Bridge Group Address 01-80-C2-00-00-00 * (MAC Control) 802.3 01-80-C2-00-00-01 * (Link Aggregation) 802.3 01-80-C2-00-00-02 * 802.1X PAE address 01-80-C2-00-00-03 * * 802.1AB LLDP 01-80-C2-00-00-0E * * Others reserved for future standardization */ fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, then must forward to keep loop detection */ if (p->br->stp_enabled == BR_NO_STP || fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL; goto drop; case 0x0E: /* 802.1AB LLDP */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; default: /* Allow selective forwarding for most other protocols */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; } BR_INPUT_SKB_CB(skb)->promisc = false; /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) * Thus return 1 from the okfn() to signal the skb is ok to pass */ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), NULL, skb, skb->dev, NULL, br_handle_local_finish) == 1) { return RX_HANDLER_PASS; } else { return RX_HANDLER_CONSUMED; } } if (unlikely(br_process_frame_type(p, skb))) return RX_HANDLER_PASS; forward: if (br_mst_is_enabled(p->br)) goto defer_stp_filtering; switch (p->state) { case BR_STATE_FORWARDING: case BR_STATE_LEARNING: defer_stp_filtering: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; return nf_hook_bridge_pre(skb, pskb); default: reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; drop: kfree_skb_reason(skb, reason); } return RX_HANDLER_CONSUMED; } /* This function has no purpose other than to appease the br_port_get_rcu/rtnl * helpers which identify bridged ports according to the rx_handler installed * on them (so there _needs_ to be a bridge rx_handler even if we don't need it * to do anything useful). This bridge won't support traffic to/from the stack, * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal * frames from the ETH_P_XDSA packet_type handler. */ static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) { return RX_HANDLER_PASS; } rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) { if (netdev_uses_dsa(dev)) return br_handle_frame_dummy; return br_handle_frame; } void br_add_frame(struct net_bridge *br, struct br_frame_type *ft) { hlist_add_head_rcu(&ft->list, &br->frame_type_list); } void br_del_frame(struct net_bridge *br, struct br_frame_type *ft) { struct br_frame_type *tmp; hlist_for_each_entry(tmp, &br->frame_type_list, list) if (ft == tmp) { hlist_del_rcu(&ft->list); return; } }
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com> */ #include <linux/sunrpc/clnt.h> #include <linux/kobject.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtsock.h> #include "sysfs.h" struct xprt_addr { const char *addr; struct rcu_head rcu; }; static void free_xprt_addr(struct rcu_head *head) { struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu); kfree(addr->addr); kfree(addr); } static struct kset *rpc_sunrpc_kset; static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj; static void rpc_sysfs_object_release(struct kobject *kobj) { kfree(kobj); } static const struct kobj_ns_type_operations * rpc_sysfs_object_child_ns_type(const struct kobject *kobj) { return &net_ns_type_operations; } static const struct kobj_type rpc_sysfs_object_type = { .release = rpc_sysfs_object_release, .sysfs_ops = &kobj_sysfs_ops, .child_ns_type = rpc_sysfs_object_child_ns_type, }; static struct kobject *rpc_sysfs_object_alloc(const char *name, struct kset *kset, struct kobject *parent) { struct kobject *kobj; kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); if (kobj) { kobj->kset = kset; if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, parent, "%s", name) == 0) return kobj; kobject_put(kobj); } return NULL; } static inline struct rpc_clnt * rpc_sysfs_client_kobj_get_clnt(struct kobject *kobj) { struct rpc_sysfs_client *c = container_of(kobj, struct rpc_sysfs_client, kobject); struct rpc_clnt *ret = c->clnt; return refcount_inc_not_zero(&ret->cl_count) ? ret : NULL; } static inline struct rpc_xprt * rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) { struct rpc_sysfs_xprt *x = container_of(kobj, struct rpc_sysfs_xprt, kobject); return xprt_get(x->xprt); } static inline struct rpc_xprt_switch * rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj) { struct rpc_sysfs_xprt *x = container_of(kobj, struct rpc_sysfs_xprt, kobject); return xprt_switch_get(x->xprt_switch); } static inline struct rpc_xprt_switch * rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) { struct rpc_sysfs_xprt_switch *x = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); return xprt_switch_get(x->xprt_switch); } static ssize_t rpc_sysfs_clnt_version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); ssize_t ret; if (!clnt) return sprintf(buf, "<closed>\n"); ret = sprintf(buf, "%u", clnt->cl_vers); refcount_dec(&clnt->cl_count); return ret; } static ssize_t rpc_sysfs_clnt_program_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); ssize_t ret; if (!clnt) return sprintf(buf, "<closed>\n"); ret = sprintf(buf, "%s", clnt->cl_program->name); refcount_dec(&clnt->cl_count); return ret; } static ssize_t rpc_sysfs_clnt_max_connect_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); ssize_t ret; if (!clnt) return sprintf(buf, "<closed>\n"); ret = sprintf(buf, "%u\n", clnt->cl_max_connect); refcount_dec(&clnt->cl_count); return ret; } static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; if (!xprt) { ret = sprintf(buf, "<closed>\n"); goto out; } ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); xprt_put(xprt); out: return ret; } static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); size_t buflen = PAGE_SIZE; ssize_t ret; if (!xprt || !xprt_connected(xprt)) { ret = sprintf(buf, "<closed>\n"); } else if (xprt->ops->get_srcaddr) { ret = xprt->ops->get_srcaddr(xprt, buf, buflen); if (ret > 0) { if (ret < buflen - 1) { buf[ret] = '\n'; ret++; buf[ret] = '\0'; } } else ret = sprintf(buf, "<closed>\n"); } else ret = sprintf(buf, "<not a socket>\n"); xprt_put(xprt); return ret; } static const char *xprtsec_strings[] = { [RPC_XPRTSEC_NONE] = "none", [RPC_XPRTSEC_TLS_ANON] = "tls-anon", [RPC_XPRTSEC_TLS_X509] = "tls-x509", }; static ssize_t rpc_sysfs_xprt_xprtsec_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; if (!xprt) { ret = sprintf(buf, "<closed>\n"); goto out; } ret = sprintf(buf, "%s\n", xprtsec_strings[xprt->xprtsec.policy]); xprt_put(xprt); out: return ret; } static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); unsigned short srcport = 0; size_t buflen = PAGE_SIZE; ssize_t ret; if (!xprt || !xprt_connected(xprt)) { ret = sprintf(buf, "<closed>\n"); goto out; } if (xprt->ops->get_srcport) srcport = xprt->ops->get_srcport(xprt); ret = snprintf(buf, buflen, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" "tasks_queuelen=%ld\ndst_port=%s\n", xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, xprt->pending.qlen, xprt->backlog.qlen, xprt->main, srcport, atomic_long_read(&xprt->queuelen), xprt->address_strings[RPC_DISPLAY_PORT]); out: xprt_put(xprt); return ret; } static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; int locked, connected, connecting, close_wait, bound, binding, closing, congested, cwnd_wait, write_space, offline, remove; if (!(xprt && xprt->state)) { ret = sprintf(buf, "state=CLOSED\n"); } else { locked = test_bit(XPRT_LOCKED, &xprt->state); connected = test_bit(XPRT_CONNECTED, &xprt->state); connecting = test_bit(XPRT_CONNECTING, &xprt->state); close_wait = test_bit(XPRT_CLOSE_WAIT, &xprt->state); bound = test_bit(XPRT_BOUND, &xprt->state); binding = test_bit(XPRT_BINDING, &xprt->state); closing = test_bit(XPRT_CLOSING, &xprt->state); congested = test_bit(XPRT_CONGESTED, &xprt->state); cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); offline = test_bit(XPRT_OFFLINE, &xprt->state); remove = test_bit(XPRT_REMOVE, &xprt->state); ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n", locked ? "LOCKED" : "", connected ? "CONNECTED" : "", connecting ? "CONNECTING" : "", close_wait ? "CLOSE_WAIT" : "", bound ? "BOUND" : "", binding ? "BOUNDING" : "", closing ? "CLOSING" : "", congested ? "CONGESTED" : "", cwnd_wait ? "CWND_WAIT" : "", write_space ? "WRITE_SPACE" : "", offline ? "OFFLINE" : "", remove ? "REMOVE" : ""); } xprt_put(xprt); return ret; } static ssize_t rpc_sysfs_xprt_del_xprt_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "# delete this xprt\n"); } static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt_switch *xprt_switch = rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); ssize_t ret; if (!xprt_switch) return 0; ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n" "num_unique_destaddr=%u\nqueue_len=%ld\n", xprt_switch->xps_nxprts, xprt_switch->xps_nactive, xprt_switch->xps_nunique_destaddr_xprts, atomic_long_read(&xprt_switch->xps_queuelen)); xprt_switch_put(xprt_switch); return ret; } static ssize_t rpc_sysfs_xprt_switch_add_xprt_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "# add one xprt to this xprt_switch\n"); } static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt_switch *xprt_switch = rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); struct xprt_create xprt_create_args; struct rpc_xprt *xprt, *new; if (!xprt_switch) return 0; xprt = rpc_xprt_switch_get_main_xprt(xprt_switch); if (!xprt) goto out; xprt_create_args.ident = xprt->xprt_class->ident; xprt_create_args.net = xprt->xprt_net; xprt_create_args.dstaddr = (struct sockaddr *)&xprt->addr; xprt_create_args.addrlen = xprt->addrlen; xprt_create_args.servername = xprt->servername; xprt_create_args.bc_xprt = xprt->bc_xprt; xprt_create_args.xprtsec = xprt->xprtsec; xprt_create_args.connect_timeout = xprt->connect_timeout; xprt_create_args.reconnect_timeout = xprt->max_reconnect_timeout; new = xprt_create_transport(&xprt_create_args); if (IS_ERR_OR_NULL(new)) { count = PTR_ERR(new); goto out_put_xprt; } rpc_xprt_switch_add_xprt(xprt_switch, new); xprt_put(new); out_put_xprt: xprt_put(xprt); out: xprt_switch_put(xprt_switch); return count; } static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); struct sockaddr *saddr; char *dst_addr; int port; struct xprt_addr *saved_addr; size_t buf_len; if (!xprt) return 0; if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP || xprt->xprt_class->ident == XPRT_TRANSPORT_TCP_TLS || xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) { xprt_put(xprt); return -EOPNOTSUPP; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); /* buf_len is the len until the first occurrence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); if (!dst_addr) goto out_err; saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); if (!saved_addr) goto out_err_free; saved_addr->addr = rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]); rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr); call_rcu(&saved_addr->rcu, free_xprt_addr); xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr, sizeof(*saddr)); rpc_set_port(saddr, port); xprt_force_disconnect(xprt); out: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); return count; out_err_free: kfree(dst_addr); out_err: count = -ENOMEM; goto out; } static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); int offline = 0, online = 0, remove = 0; struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); if (!xprt || !xps) { count = 0; goto out_put; } if (!strncmp(buf, "offline", 7)) offline = 1; else if (!strncmp(buf, "online", 6)) online = 1; else if (!strncmp(buf, "remove", 6)) remove = 1; else { count = -EINVAL; goto out_put; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } if (xprt->main) { count = -EINVAL; goto release_tasks; } if (offline) { xprt_set_offline_locked(xprt, xps); } else if (online) { xprt_set_online_locked(xprt, xps); } else if (remove) { if (test_bit(XPRT_OFFLINE, &xprt->state)) xprt_delete_locked(xprt, xps); else count = -EINVAL; } release_tasks: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); xprt_switch_put(xps); return count; } static ssize_t rpc_sysfs_xprt_del_xprt(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); if (!xprt || !xps) { count = 0; goto out; } if (xprt->main) { count = -EINVAL; goto release_tasks; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } xprt_set_offline_locked(xprt, xps); xprt_delete_locked(xprt, xps); release_tasks: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); xprt_switch_put(xps); out: return count; } int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); if (!rpc_sunrpc_kset) return -ENOMEM; rpc_sunrpc_client_kobj = rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL); if (!rpc_sunrpc_client_kobj) goto err_client; rpc_sunrpc_xprt_switch_kobj = rpc_sysfs_object_alloc("xprt-switches", rpc_sunrpc_kset, NULL); if (!rpc_sunrpc_xprt_switch_kobj) goto err_switch; return 0; err_switch: kobject_put(rpc_sunrpc_client_kobj); rpc_sunrpc_client_kobj = NULL; err_client: kset_unregister(rpc_sunrpc_kset); rpc_sunrpc_kset = NULL; return -ENOMEM; } static void rpc_sysfs_client_release(struct kobject *kobj) { struct rpc_sysfs_client *c; c = container_of(kobj, struct rpc_sysfs_client, kobject); kfree(c); } static void rpc_sysfs_xprt_switch_release(struct kobject *kobj) { struct rpc_sysfs_xprt_switch *xprt_switch; xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); kfree(xprt_switch); } static void rpc_sysfs_xprt_release(struct kobject *kobj) { struct rpc_sysfs_xprt *xprt; xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject); kfree(xprt); } static const void *rpc_sysfs_client_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_client, kobject)->net; } static const void *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; } static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt, kobject)->xprt->xprt_net; } static struct kobj_attribute rpc_sysfs_clnt_version = __ATTR(rpc_version, 0444, rpc_sysfs_clnt_version_show, NULL); static struct kobj_attribute rpc_sysfs_clnt_program = __ATTR(program, 0444, rpc_sysfs_clnt_program_show, NULL); static struct kobj_attribute rpc_sysfs_clnt_max_connect = __ATTR(max_connect, 0444, rpc_sysfs_clnt_max_connect_show, NULL); static struct attribute *rpc_sysfs_rpc_clnt_attrs[] = { &rpc_sysfs_clnt_version.attr, &rpc_sysfs_clnt_program.attr, &rpc_sysfs_clnt_max_connect.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_rpc_clnt); static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, 0644, rpc_sysfs_xprt_srcaddr_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_xprtsec = __ATTR(xprtsec, 0644, rpc_sysfs_xprt_xprtsec_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); static struct kobj_attribute rpc_sysfs_xprt_del = __ATTR(del_xprt, 0644, rpc_sysfs_xprt_del_xprt_show, rpc_sysfs_xprt_del_xprt); static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, &rpc_sysfs_xprt_srcaddr.attr, &rpc_sysfs_xprt_xprtsec.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, &rpc_sysfs_xprt_del.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_switch_add_xprt = __ATTR(add_xprt, 0644, rpc_sysfs_xprt_switch_add_xprt_show, rpc_sysfs_xprt_switch_add_xprt_store); static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, &rpc_sysfs_xprt_switch_add_xprt.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); static const struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, .default_groups = rpc_sysfs_rpc_clnt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_client_namespace, }; static const struct kobj_type rpc_sysfs_xprt_switch_type = { .release = rpc_sysfs_xprt_switch_release, .default_groups = rpc_sysfs_xprt_switch_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_switch_namespace, }; static const struct kobj_type rpc_sysfs_xprt_type = { .release = rpc_sysfs_xprt_release, .default_groups = rpc_sysfs_xprt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_namespace, }; void rpc_sysfs_exit(void) { kobject_put(rpc_sunrpc_client_kobj); kobject_put(rpc_sunrpc_xprt_switch_kobj); kset_unregister(rpc_sunrpc_kset); } static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, struct net *net, int clid) { struct rpc_sysfs_client *p; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type, parent, "clnt-%d", clid) == 0) return p; kobject_put(&p->kobject); } return NULL; } static struct rpc_sysfs_xprt_switch * rpc_sysfs_xprt_switch_alloc(struct kobject *parent, struct rpc_xprt_switch *xprt_switch, struct net *net, gfp_t gfp_flags) { struct rpc_sysfs_xprt_switch *p; p = kzalloc(sizeof(*p), gfp_flags); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_switch_type, parent, "switch-%d", xprt_switch->xps_id) == 0) return p; kobject_put(&p->kobject); } return NULL; } static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt *p; p = kzalloc(sizeof(*p), gfp_flags); if (!p) goto out; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type, parent, "xprt-%d-%s", xprt->id, xprt->address_strings[RPC_DISPLAY_PROTO]) == 0) return p; kobject_put(&p->kobject); out: return NULL; } void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct rpc_xprt_switch *xprt_switch, struct net *net) { struct rpc_sysfs_client *rpc_client; struct rpc_sysfs_xprt_switch *xswitch = (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; if (!xswitch) return; rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, net, clnt->cl_clid); if (rpc_client) { char name[] = "switch"; int ret; clnt->cl_sysfs = rpc_client; rpc_client->clnt = clnt; rpc_client->xprt_switch = xprt_switch; kobject_uevent(&rpc_client->kobject, KOBJ_ADD); ret = sysfs_create_link_nowarn(&rpc_client->kobject, &xswitch->kobject, name); if (ret) pr_warn("can't create link to %s in sysfs (%d)\n", name, ret); } } void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt_switch *rpc_xprt_switch; struct net *net; if (xprt_switch->xps_net) net = xprt_switch->xps_net; else net = xprt->xprt_net; rpc_xprt_switch = rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj, xprt_switch, net, gfp_flags); if (rpc_xprt_switch) { xprt_switch->xps_sysfs = rpc_xprt_switch; rpc_xprt_switch->xprt_switch = xprt_switch; rpc_xprt_switch->xprt = xprt; kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD); } else { xprt_switch->xps_sysfs = NULL; } } void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt *rpc_xprt; struct rpc_sysfs_xprt_switch *switch_obj = (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; if (!switch_obj) return; rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags); if (rpc_xprt) { xprt->xprt_sysfs = rpc_xprt; rpc_xprt->xprt = xprt; rpc_xprt->xprt_switch = xprt_switch; kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD); } } void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) { struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; if (rpc_client) { char name[] = "switch"; sysfs_remove_link(&rpc_client->kobject, name); kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE); kobject_del(&rpc_client->kobject); kobject_put(&rpc_client->kobject); clnt->cl_sysfs = NULL; } } void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch) { struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs; if (rpc_xprt_switch) { kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE); kobject_del(&rpc_xprt_switch->kobject); kobject_put(&rpc_xprt_switch->kobject); xprt_switch->xps_sysfs = NULL; } } void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt) { struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs; if (rpc_xprt) { kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE); kobject_del(&rpc_xprt->kobject); kobject_put(&rpc_xprt->kobject); xprt->xprt_sysfs = NULL; } }
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007-2012 Siemens AG * * Written by: * Pavel Smolenskiy <pavel.smolenskiy@gmail.com> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/crc-ccitt.h> #include <linux/unaligned.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include <net/nl802154.h> #include "ieee802154_i.h" static int ieee802154_deliver_skb(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_UNNECESSARY; skb->protocol = htons(ETH_P_IEEE802154); return netif_receive_skb(skb); } void mac802154_rx_beacon_worker(struct work_struct *work) { struct ieee802154_local *local = container_of(work, struct ieee802154_local, rx_beacon_work); struct cfg802154_mac_pkt *mac_pkt; mac_pkt = list_first_entry_or_null(&local->rx_beacon_list, struct cfg802154_mac_pkt, node); if (!mac_pkt) return; mac802154_process_beacon(local, mac_pkt->skb, mac_pkt->page, mac_pkt->channel); list_del(&mac_pkt->node); kfree_skb(mac_pkt->skb); kfree(mac_pkt); } static bool mac802154_should_answer_beacon_req(struct ieee802154_local *local) { struct cfg802154_beacon_request *beacon_req; unsigned int interval; rcu_read_lock(); beacon_req = rcu_dereference(local->beacon_req); if (!beacon_req) { rcu_read_unlock(); return false; } interval = beacon_req->interval; rcu_read_unlock(); if (!mac802154_is_beaconing(local)) return false; return interval == IEEE802154_ACTIVE_SCAN_DURATION; } void mac802154_rx_mac_cmd_worker(struct work_struct *work) { struct ieee802154_local *local = container_of(work, struct ieee802154_local, rx_mac_cmd_work); struct cfg802154_mac_pkt *mac_pkt; u8 mac_cmd; int rc; mac_pkt = list_first_entry_or_null(&local->rx_mac_cmd_list, struct cfg802154_mac_pkt, node); if (!mac_pkt) return; rc = ieee802154_get_mac_cmd(mac_pkt->skb, &mac_cmd); if (rc) goto out; switch (mac_cmd) { case IEEE802154_CMD_BEACON_REQ: dev_dbg(&mac_pkt->sdata->dev->dev, "processing BEACON REQ\n"); if (!mac802154_should_answer_beacon_req(local)) break; queue_delayed_work(local->mac_wq, &local->beacon_work, 0); break; case IEEE802154_CMD_ASSOCIATION_RESP: dev_dbg(&mac_pkt->sdata->dev->dev, "processing ASSOC RESP\n"); if (!mac802154_is_associating(local)) break; mac802154_process_association_resp(mac_pkt->sdata, mac_pkt->skb); break; case IEEE802154_CMD_ASSOCIATION_REQ: dev_dbg(&mac_pkt->sdata->dev->dev, "processing ASSOC REQ\n"); if (mac_pkt->sdata->wpan_dev.iftype != NL802154_IFTYPE_COORD) break; mac802154_process_association_req(mac_pkt->sdata, mac_pkt->skb); break; case IEEE802154_CMD_DISASSOCIATION_NOTIFY: dev_dbg(&mac_pkt->sdata->dev->dev, "processing DISASSOC NOTIF\n"); if (mac_pkt->sdata->wpan_dev.iftype != NL802154_IFTYPE_COORD) break; mac802154_process_disassociation_notif(mac_pkt->sdata, mac_pkt->skb); break; default: break; } out: list_del(&mac_pkt->node); kfree_skb(mac_pkt->skb); kfree(mac_pkt); } static int ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, struct sk_buff *skb, const struct ieee802154_hdr *hdr) { struct wpan_phy *wpan_phy = sdata->local->hw.phy; struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct cfg802154_mac_pkt *mac_pkt; __le16 span, sshort; int rc; pr_debug("getting packet via slave interface %s\n", sdata->dev->name); span = wpan_dev->pan_id; sshort = wpan_dev->short_addr; /* Level 3 filtering: Only beacons are accepted during scans */ if (sdata->required_filtering == IEEE802154_FILTERING_3_SCAN && sdata->required_filtering > wpan_phy->filtering) { if (mac_cb(skb)->type != IEEE802154_FC_TYPE_BEACON) { dev_dbg(&sdata->dev->dev, "drop non-beacon frame (0x%x) during scan\n", mac_cb(skb)->type); goto fail; } } switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: if (hdr->source.mode == IEEE802154_ADDR_NONE) /* ACK comes with both addresses empty */ skb->pkt_type = PACKET_HOST; else if (!wpan_dev->parent) /* No dest means PAN coordinator is the recipient */ skb->pkt_type = PACKET_HOST; else /* We are not the PAN coordinator, just relaying */ skb->pkt_type = PACKET_OTHERHOST; break; case IEEE802154_ADDR_LONG: if (mac_cb(skb)->dest.pan_id != span && mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) skb->pkt_type = PACKET_OTHERHOST; else if (mac_cb(skb)->dest.extended_addr == wpan_dev->extended_addr) skb->pkt_type = PACKET_HOST; else skb->pkt_type = PACKET_OTHERHOST; break; case IEEE802154_ADDR_SHORT: if (mac_cb(skb)->dest.pan_id != span && mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) skb->pkt_type = PACKET_OTHERHOST; else if (mac_cb(skb)->dest.short_addr == sshort) skb->pkt_type = PACKET_HOST; else if (mac_cb(skb)->dest.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_OTHERHOST; break; default: pr_debug("invalid dest mode\n"); goto fail; } skb->dev = sdata->dev; /* TODO this should be moved after netif_receive_skb call, otherwise * wireshark will show a mac header with security fields and the * payload is already decrypted. */ rc = mac802154_llsec_decrypt(&sdata->sec, skb); if (rc) { pr_debug("decryption failed: %i\n", rc); goto fail; } sdata->dev->stats.rx_packets++; sdata->dev->stats.rx_bytes += skb->len; switch (mac_cb(skb)->type) { case IEEE802154_FC_TYPE_BEACON: dev_dbg(&sdata->dev->dev, "BEACON received\n"); if (!mac802154_is_scanning(sdata->local)) goto fail; mac_pkt = kzalloc(sizeof(*mac_pkt), GFP_ATOMIC); if (!mac_pkt) goto fail; mac_pkt->skb = skb_get(skb); mac_pkt->sdata = sdata; mac_pkt->page = sdata->local->scan_page; mac_pkt->channel = sdata->local->scan_channel; list_add_tail(&mac_pkt->node, &sdata->local->rx_beacon_list); queue_work(sdata->local->mac_wq, &sdata->local->rx_beacon_work); return NET_RX_SUCCESS; case IEEE802154_FC_TYPE_MAC_CMD: dev_dbg(&sdata->dev->dev, "MAC COMMAND received\n"); mac_pkt = kzalloc(sizeof(*mac_pkt), GFP_ATOMIC); if (!mac_pkt) goto fail; mac_pkt->skb = skb_get(skb); mac_pkt->sdata = sdata; list_add_tail(&mac_pkt->node, &sdata->local->rx_mac_cmd_list); queue_work(sdata->local->mac_wq, &sdata->local->rx_mac_cmd_work); return NET_RX_SUCCESS; case IEEE802154_FC_TYPE_ACK: goto fail; case IEEE802154_FC_TYPE_DATA: return ieee802154_deliver_skb(skb); default: pr_warn_ratelimited("ieee802154: bad frame received " "(type = %d)\n", mac_cb(skb)->type); goto fail; } fail: kfree_skb(skb); return NET_RX_DROP; } static void ieee802154_print_addr(const char *name, const struct ieee802154_addr *addr) { if (addr->mode == IEEE802154_ADDR_NONE) { pr_debug("%s not present\n", name); return; } pr_debug("%s PAN ID: %04x\n", name, le16_to_cpu(addr->pan_id)); if (addr->mode == IEEE802154_ADDR_SHORT) { pr_debug("%s is short: %04x\n", name, le16_to_cpu(addr->short_addr)); } else { u64 hw = swab64((__force u64)addr->extended_addr); pr_debug("%s is hardware: %8phC\n", name, &hw); } } static int ieee802154_parse_frame_start(struct sk_buff *skb, struct ieee802154_hdr *hdr) { int hlen; struct ieee802154_mac_cb *cb = mac_cb(skb); skb_reset_mac_header(skb); hlen = ieee802154_hdr_pull(skb, hdr); if (hlen < 0) return -EINVAL; skb->mac_len = hlen; pr_debug("fc: %04x dsn: %02x\n", le16_to_cpup((__le16 *)&hdr->fc), hdr->seq); cb->type = hdr->fc.type; cb->ackreq = hdr->fc.ack_request; cb->secen = hdr->fc.security_enabled; ieee802154_print_addr("destination", &hdr->dest); ieee802154_print_addr("source", &hdr->source); cb->source = hdr->source; cb->dest = hdr->dest; if (hdr->fc.security_enabled) { u64 key; pr_debug("seclevel %i\n", hdr->sec.level); switch (hdr->sec.key_id_mode) { case IEEE802154_SCF_KEY_IMPLICIT: pr_debug("implicit key\n"); break; case IEEE802154_SCF_KEY_INDEX: pr_debug("key %02x\n", hdr->sec.key_id); break; case IEEE802154_SCF_KEY_SHORT_INDEX: pr_debug("key %04x:%04x %02x\n", le32_to_cpu(hdr->sec.short_src) >> 16, le32_to_cpu(hdr->sec.short_src) & 0xffff, hdr->sec.key_id); break; case IEEE802154_SCF_KEY_HW_INDEX: key = swab64((__force u64)hdr->sec.extended_src); pr_debug("key source %8phC %02x\n", &key, hdr->sec.key_id); break; } } return 0; } static void __ieee802154_rx_handle_packet(struct ieee802154_local *local, struct sk_buff *skb) { int ret; struct ieee802154_sub_if_data *sdata; struct ieee802154_hdr hdr; struct sk_buff *skb2; ret = ieee802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); return; } list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->wpan_dev.iftype == NL802154_IFTYPE_MONITOR) continue; if (!ieee802154_sdata_running(sdata)) continue; /* Do not deliver packets received on interfaces expecting * AACK=1 if the address filters where disabled. */ if (local->hw.phy->filtering < IEEE802154_FILTERING_4_FRAME_FIELDS && sdata->required_filtering == IEEE802154_FILTERING_4_FRAME_FIELDS) continue; skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = sdata->dev; ieee802154_subif_frame(sdata, skb2, &hdr); } } } static void ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb) { struct sk_buff *skb2; struct ieee802154_sub_if_data *sdata; skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_IEEE802154); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) continue; if (!ieee802154_sdata_running(sdata)) continue; skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = sdata->dev; ieee802154_deliver_skb(skb2); sdata->dev->stats.rx_packets++; sdata->dev->stats.rx_bytes += skb->len; } } } void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) { u16 crc; WARN_ON_ONCE(softirq_count() == 0); if (local->suspended) goto free_skb; /* TODO: When a transceiver omits the checksum here, we * add an own calculated one. This is currently an ugly * solution because the monitor needs a crc here. */ if (local->hw.flags & IEEE802154_HW_RX_OMIT_CKSUM) { crc = crc_ccitt(0, skb->data, skb->len); put_unaligned_le16(crc, skb_put(skb, 2)); } rcu_read_lock(); ieee802154_monitors_rx(local, skb); /* Level 1 filtering: Check the FCS by software when relevant */ if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); if (crc) goto drop; } /* remove crc */ skb_trim(skb, skb->len - 2); __ieee802154_rx_handle_packet(local, skb); drop: rcu_read_unlock(); free_skb: kfree_skb(skb); } void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_mac_cb *cb = mac_cb_init(skb); cb->lqi = lqi; skb->pkt_type = IEEE802154_RX_MSG; skb_queue_tail(&local->skb_queue, skb); tasklet_schedule(&local->tasklet); } EXPORT_SYMBOL(ieee802154_rx_irqsafe);
23430 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H #include <linux/uaccess.h> #include <linux/types.h> #include <asm/processor.h> #include <asm/fpu/api.h> #include <asm/user.h> /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) #define XSAVE_ALIGNMENT 64 /* All currently supported user features */ #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) /* * Features which are restored when returning to user space. * PKRU is not restored on return to user space because PKRU * is switched eagerly in switch_to() and flush_thread() */ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) /* Features which are dynamically enabled for a process on request */ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* Supervisor features which are enabled only in guest FPUs */ #define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_GUEST_SUPERVISOR) /* * A supervisor state component may not always contain valuable information, * and its size may be huge. Saving/restoring such supervisor state components * at each context switch can cause high CPU and space overhead, which should * be avoided. Such supervisor state components should only be saved/restored * on demand. The on-demand supervisor features are set in this mask. * * Unlike the existing supported supervisor features, an independent supervisor * feature does not allocate a buffer in task->fpu, and the corresponding * supervisor state component cannot be saved/restored at each context switch. * * To support an independent supervisor feature, a developer should follow the * dos and don'ts as below: * - Do dynamically allocate a buffer for the supervisor state component. * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the * state component to/from the buffer. * - Don't set the bit corresponding to the independent supervisor feature in * IA32_XSS at run time, since it has been set at boot time. */ #define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR) /* * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ #define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) /* * The feature mask required to restore FPU state: * - All user states which are not eagerly switched in switch_to()/exec() * - The suporvisor states */ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) /* * Features in this mask have space allocated in the signal frame, but may not * have that space initialized when the feature is in its init state. */ #define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ XFEATURE_MASK_USER_DYNAMIC) extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); int xfd_enable_feature(u64 xfd_err); #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); #endif #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); static __always_inline __pure bool fpu_state_size_dynamic(void) { return static_branch_unlikely(&__fpu_state_size_dynamic); } #else static __always_inline __pure bool fpu_state_size_dynamic(void) { return false; } #endif #endif
27 4 3 22 22 3 3 3 3 3 2 3 12 1 1 7 5 4 3 1 5 4 2 2 8 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 /* * net/tipc/net.c: TIPC network routing code * * Copyright (c) 1995-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "net.h" #include "name_distr.h" #include "subscr.h" #include "socket.h" #include "node.h" #include "bcast.h" #include "link.h" #include "netlink.h" #include "monitor.h" /* * The TIPC locking policy is designed to ensure a very fine locking * granularity, permitting complete parallel access to individual * port and node/link instances. The code consists of four major * locking domains, each protected with their own disjunct set of locks. * * 1: The bearer level. * RTNL lock is used to serialize the process of configuring bearer * on update side, and RCU lock is applied on read side to make * bearer instance valid on both paths of message transmission and * reception. * * 2: The node and link level. * All node instances are saved into two tipc_node_list and node_htable * lists. The two lists are protected by node_list_lock on write side, * and they are guarded with RCU lock on read side. Especially node * instance is destroyed only when TIPC module is removed, and we can * confirm that there has no any user who is accessing the node at the * moment. Therefore, Except for iterating the two lists within RCU * protection, it's no needed to hold RCU that we access node instance * in other places. * * In addition, all members in node structure including link instances * are protected by node spin lock. * * 3: The transport level of the protocol. * This consists of the structures port, (and its user level * representations, such as user_port and tipc_sock), reference and * tipc_user (port.c, reg.c, socket.c). * * This layer has four different locks: * - The tipc_port spin_lock. This is protecting each port instance * from parallel data access and removal. Since we can not place * this lock in the port itself, it has been placed in the * corresponding reference table entry, which has the same life * cycle as the module. This entry is difficult to access from * outside the TIPC core, however, so a pointer to the lock has * been added in the port instance, -to be used for unlocking * only. * - A read/write lock to protect the reference table itself (teg.c). * (Nobody is using read-only access to this, so it can just as * well be changed to a spin_lock) * - A spin lock to protect the registry of kernel/driver users (reg.c) * - A global spin_lock (tipc_port_lock), which only task is to ensure * consistency where more than one port is involved in an operation, * i.e., when a port is part of a linked list of ports. * There are two such lists; 'port_list', which is used for management, * and 'wait_list', which is used to queue ports during congestion. * * 4: The name table (name_table.c, name_distr.c, subscription.c) * - There is one big read/write-lock (tipc_nametbl_lock) protecting the * overall name table structure. Nothing must be added/removed to * this structure without holding write access to it. * - There is one local spin_lock per sub_sequence, which can be seen * as a sub-domain to the tipc_nametbl_lock domain. It is used only * for translation operations, and is needed because a translation * steps the root of the 'publication' linked list between each lookup. * This is always used within the scope of a tipc_nametbl_lock(read). * - A local spin_lock protecting the queue of subscriber events. */ static void tipc_net_finalize(struct net *net, u32 addr); int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { if (tipc_own_id(net)) { pr_info("Cannot configure node identity twice\n"); return -1; } pr_info("Started in network mode\n"); if (node_id) tipc_set_node_id(net, node_id); if (addr) tipc_net_finalize(net, addr); return 0; } static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_socket_addr sk = {0, addr}; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, TIPC_NODE_STATE, addr, addr); if (cmpxchg(&tn->node_addr, 0, addr)) return; tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); tipc_mon_reinit_self(net); tipc_nametbl_publish(net, &ua, &sk, addr); } void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net *tn = container_of(work, struct tipc_net, work); tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr); } void tipc_net_stop(struct net *net) { if (!tipc_own_id(net)) return; rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); rtnl_unlock(); pr_info("Left network mode\n"); } static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = net_generic(net, tipc_net_id); u64 *w0 = (u64 *)&tn->node_id[0]; u64 *w1 = (u64 *)&tn->node_id[8]; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NET_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int err; int done = cb->args[0]; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; err = __tipc_nl_add_net(net, &msg); if (err) goto out; done = 1; out: cb->args[0] = done; return skb->len; } int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); int err; if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* Can't change net id once TIPC has joined a network */ if (tipc_own_addr(net)) return -EPERM; if (attrs[TIPC_NLA_NET_ID]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_NET_ID]); if (val < 1 || val > 9999) return -EINVAL; tn->net_id = val; } if (attrs[TIPC_NLA_NET_ADDR]) { u32 addr; addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; tn->legacy_addr_format = true; tipc_net_init(net, NULL, addr); } if (attrs[TIPC_NLA_NET_NODEID]) { u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); tipc_net_init(net, node_id, 0); } return 0; } int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_net_set(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = tipc_net(net); struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_ADDR_LEGACY_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (tn->legacy_addr_format) if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; struct sk_buff *rep; int err; rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!rep) return -ENOMEM; msg.skb = rep; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_addr_legacy_get(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); }
24 24 24 24 24 14 24 38 42 43 41 19 26 3 3 43 43 38 26 26 26 25 29 30 30 30 3 3 3 22 22 22 22 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "originator.h" #include "send.h" #include "tvlv.h" /** * batadv_tvlv_handler_release() - release tvlv handler from lists and queue for * free after rcu grace period * @ref: kref pointer of the tvlv */ static void batadv_tvlv_handler_release(struct kref *ref) { struct batadv_tvlv_handler *tvlv_handler; tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); kfree_rcu(tvlv_handler, rcu); } /** * batadv_tvlv_handler_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv_handler: the tvlv handler to free */ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { if (!tvlv_handler) return; kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } /** * batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list * based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv handler type to look for * @version: tvlv handler version to look for * * Return: tvlv handler if found or NULL otherwise. */ static struct batadv_tvlv_handler * batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler_tmp, &bat_priv->tvlv.handler_list, list) { if (tvlv_handler_tmp->type != type) continue; if (tvlv_handler_tmp->version != version) continue; if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) continue; tvlv_handler = tvlv_handler_tmp; break; } rcu_read_unlock(); return tvlv_handler; } /** * batadv_tvlv_container_release() - release tvlv from lists and free * @ref: kref pointer of the tvlv */ static void batadv_tvlv_container_release(struct kref *ref) { struct batadv_tvlv_container *tvlv; tvlv = container_of(ref, struct batadv_tvlv_container, refcount); kfree(tvlv); } /** * batadv_tvlv_container_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv: the tvlv container to free */ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { if (!tvlv) return; kref_put(&tvlv->refcount, batadv_tvlv_container_release); } /** * batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container * list based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type to look for * @version: tvlv container version to look for * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * * Return: tvlv container if found or NULL otherwise. */ static struct batadv_tvlv_container * batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { if (tvlv_tmp->tvlv_hdr.type != type) continue; if (tvlv_tmp->tvlv_hdr.version != version) continue; kref_get(&tvlv_tmp->refcount); tvlv = tvlv_tmp; break; } return tvlv; } /** * batadv_tvlv_container_list_size() - calculate the size of the tvlv container * list entries * @bat_priv: the bat priv with all the mesh interface information * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * * Return: size of all currently registered tvlv containers in bytes. */ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; u16 tvlv_len = 0; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); tvlv_len += ntohs(tvlv->tvlv_hdr.len); } return tvlv_len; } /** * batadv_tvlv_container_remove() - remove tvlv container from the tvlv * container list * @bat_priv: the bat priv with all the mesh interface information * @tvlv: the to be removed tvlv container * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). */ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, struct batadv_tvlv_container *tvlv) { lockdep_assert_held(&bat_priv->tvlv.container_list_lock); if (!tvlv) return; hlist_del(&tvlv->list); /* first call to decrement the counter, second call to free */ batadv_tvlv_container_put(tvlv); batadv_tvlv_container_put(tvlv); } /** * batadv_tvlv_container_unregister() - unregister tvlv container based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type to unregister * @version: tvlv container type to unregister */ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); } /** * batadv_tvlv_container_register() - register tvlv type, version and content * to be propagated with each (primary interface) OGM * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type * @version: tvlv container version * @tvlv_value: tvlv container content * @tvlv_value_len: tvlv container content length * * If a container of the same type and version was already registered the new * content is going to replace the old one. */ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_container *tvlv_old, *tvlv_new; if (!tvlv_value) tvlv_value_len = 0; tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC); if (!tvlv_new) return; tvlv_new->tvlv_hdr.version = version; tvlv_new->tvlv_hdr.type = type; tvlv_new->tvlv_hdr.len = htons(tvlv_value_len); memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); INIT_HLIST_NODE(&tvlv_new->list); kref_init(&tvlv_new->refcount); spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv_old); kref_get(&tvlv_new->refcount); hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); /* don't return reference to new tvlv_container */ batadv_tvlv_container_put(tvlv_new); } /** * batadv_tvlv_realloc_packet_buff() - reallocate packet buffer to accommodate * requested packet size * @packet_buff: packet buffer * @packet_buff_len: packet buffer size * @min_packet_len: requested packet minimum size * @additional_packet_len: requested additional packet size on top of minimum * size * * Return: true of the packet buffer could be changed to the requested size, * false otherwise. */ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, int *packet_buff_len, int min_packet_len, int additional_packet_len) { unsigned char *new_buff; new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); /* keep old buffer if kmalloc should fail */ if (!new_buff) return false; memcpy(new_buff, *packet_buff, min_packet_len); kfree(*packet_buff); *packet_buff = new_buff; *packet_buff_len = min_packet_len + additional_packet_len; return true; } /** * batadv_tvlv_container_ogm_append() - append tvlv container content to given * OGM packet buffer * @bat_priv: the bat priv with all the mesh interface information * @packet_buff: ogm packet buffer * @packet_buff_len: ogm packet buffer size including ogm header and tvlv * content * @packet_min_len: ogm header size to be preserved for the OGM itself * * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * * Return: size of all appended tvlv containers in bytes. */ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, int *packet_buff_len, int packet_min_len) { struct batadv_tvlv_container *tvlv; struct batadv_tvlv_hdr *tvlv_hdr; u16 tvlv_value_len; void *tvlv_value; bool ret; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, packet_min_len, tvlv_value_len); if (!ret) goto end; if (!tvlv_value_len) goto end; tvlv_value = (*packet_buff) + packet_min_len; hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_hdr = tvlv_value; tvlv_hdr->type = tvlv->tvlv_hdr.type; tvlv_hdr->version = tvlv->tvlv_hdr.version; tvlv_hdr->len = tvlv->tvlv_hdr.len; tvlv_value = tvlv_hdr + 1; memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); } end: spin_unlock_bh(&bat_priv->tvlv.container_list_lock); return tvlv_value_len; } /** * batadv_tvlv_call_handler() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the mesh interface information * @tvlv_handler: tvlv callback function handling the tvlv content * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * * Return: success if the handler was not found or the return value of the * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, u8 packet_type, struct batadv_orig_node *orig_node, struct sk_buff *skb, void *tvlv_value, u16 tvlv_value_len) { unsigned int tvlv_offset; u8 *src, *dst; if (!tvlv_handler) return NET_RX_SUCCESS; switch (packet_type) { case BATADV_IV_OGM: case BATADV_OGM2: if (!tvlv_handler->ogm_handler) return NET_RX_SUCCESS; if (!orig_node) return NET_RX_SUCCESS; tvlv_handler->ogm_handler(bat_priv, orig_node, BATADV_NO_FLAGS, tvlv_value, tvlv_value_len); tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; break; case BATADV_UNICAST_TVLV: if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->unicast_handler) return NET_RX_SUCCESS; src = ((struct batadv_unicast_tvlv_packet *)skb->data)->src; dst = ((struct batadv_unicast_tvlv_packet *)skb->data)->dst; return tvlv_handler->unicast_handler(bat_priv, src, dst, tvlv_value, tvlv_value_len); case BATADV_MCAST: if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->mcast_handler) return NET_RX_SUCCESS; tvlv_offset = (unsigned char *)tvlv_value - skb->data; skb_set_network_header(skb, tvlv_offset); skb_set_transport_header(skb, tvlv_offset + tvlv_value_len); return tvlv_handler->mcast_handler(bat_priv, skb); } return NET_RX_SUCCESS; } /** * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the mesh interface information * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * * Return: success when processing an OGM or the return value of all called * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, u8 packet_type, struct batadv_orig_node *orig_node, struct sk_buff *skb, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_handler *tvlv_handler; struct batadv_tvlv_hdr *tvlv_hdr; u16 tvlv_value_cont_len; u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; int ret = NET_RX_SUCCESS; while (tvlv_value_len >= sizeof(*tvlv_hdr)) { tvlv_hdr = tvlv_value; tvlv_value_cont_len = ntohs(tvlv_hdr->len); tvlv_value = tvlv_hdr + 1; tvlv_value_len -= sizeof(*tvlv_hdr); if (tvlv_value_cont_len > tvlv_value_len) break; tvlv_handler = batadv_tvlv_handler_get(bat_priv, tvlv_hdr->type, tvlv_hdr->version); ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, packet_type, orig_node, skb, tvlv_value, tvlv_value_cont_len); batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } if (packet_type != BATADV_IV_OGM && packet_type != BATADV_OGM2) return ret; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler, &bat_priv->tvlv.handler_list, list) { if (!tvlv_handler->ogm_handler) continue; if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) tvlv_handler->ogm_handler(bat_priv, orig_node, cifnotfound, NULL, 0); tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED; } rcu_read_unlock(); return NET_RX_SUCCESS; } /** * batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate * handlers * @bat_priv: the bat priv with all the mesh interface information * @batadv_ogm_packet: ogm packet containing the tvlv containers * @orig_node: orig node emitting the ogm packet */ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_orig_node *orig_node) { void *tvlv_value; u16 tvlv_value_len; if (!batadv_ogm_packet) return; tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len); if (!tvlv_value_len) return; tvlv_value = batadv_ogm_packet + 1; batadv_tvlv_containers_process(bat_priv, BATADV_IV_OGM, orig_node, NULL, tvlv_value, tvlv_value_len); } /** * batadv_tvlv_handler_register() - register tvlv handler based on the provided * type and version (both need to match) for ogm tvlv payload and/or unicast * payload * @bat_priv: the bat priv with all the mesh interface information * @optr: ogm tvlv handler callback function. This function receives the orig * node, flags and the tvlv content as argument to process. * @uptr: unicast tvlv handler callback function. This function receives the * source & destination of the unicast packet as well as the tvlv content * to process. * @mptr: multicast packet tvlv handler callback function. This function * receives the full skb to process, with the skb network header pointing * to the current tvlv and the skb transport header pointing to the first * byte after the current tvlv. * @type: tvlv handler type to be registered * @version: tvlv handler version to be registered * @flags: flags to enable or disable TVLV API behavior */ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, void (*optr)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len), int (*uptr)(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len), int (*mptr)(struct batadv_priv *bat_priv, struct sk_buff *skb), u8 type, u8 version, u8 flags) { struct batadv_tvlv_handler *tvlv_handler; spin_lock_bh(&bat_priv->tvlv.handler_list_lock); tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); return; } tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); if (!tvlv_handler) { spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; } tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; tvlv_handler->mcast_handler = mptr; tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); /* don't return reference to new tvlv_handler */ batadv_tvlv_handler_put(tvlv_handler); } /** * batadv_tvlv_handler_unregister() - unregister tvlv handler based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv handler type to be unregistered * @version: tvlv handler version to be unregistered */ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler; tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (!tvlv_handler) return; batadv_tvlv_handler_put(tvlv_handler); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); hlist_del_rcu(&tvlv_handler->list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); } /** * batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the * specified host * @bat_priv: the bat priv with all the mesh interface information * @src: source mac address of the unicast packet * @dst: destination mac address of the unicast packet * @type: tvlv type * @version: tvlv version * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length */ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; struct batadv_tvlv_hdr *tvlv_hdr; struct batadv_orig_node *orig_node; struct sk_buff *skb; unsigned char *tvlv_buff; unsigned int tvlv_len; ssize_t hdr_len = sizeof(*unicast_tvlv_packet); orig_node = batadv_orig_hash_find(bat_priv, dst); if (!orig_node) return; tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len); if (!skb) goto out; skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; unicast_tvlv_packet->ttl = BATADV_TTL; unicast_tvlv_packet->reserved = 0; unicast_tvlv_packet->tvlv_len = htons(tvlv_len); unicast_tvlv_packet->align = 0; ether_addr_copy(unicast_tvlv_packet->src, src); ether_addr_copy(unicast_tvlv_packet->dst, dst); tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; tvlv_hdr->version = version; tvlv_hdr->type = type; tvlv_hdr->len = htons(tvlv_value_len); tvlv_buff += sizeof(*tvlv_hdr); memcpy(tvlv_buff, tvlv_value, tvlv_value_len); batadv_send_skb_to_orig(skb, orig_node, NULL); out: batadv_orig_node_put(orig_node); }
2 2 1 1 2 2 2 1 1 1 1 1 1 1 1 24 24 13 11 21 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2019 Mellanox Technologies. All rights reserved. */ #include <rdma/ib_verbs.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" #define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, enum rdma_nl_counter_mask new_mask, bool bind_opcnt) { if (new_mode == RDMA_COUNTER_MODE_AUTO) { if (new_mask & (~ALL_AUTO_MODE_MASKS)) return -EINVAL; if (port_counter->num_counters) return -EBUSY; } port_counter->mode.mode = new_mode; port_counter->mode.mask = new_mask; port_counter->mode.bind_opcnt = bind_opcnt; return 0; } /* * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * * @dev: Device to operate * @port: Port to use * @mask: Mask to configure * @extack: Message to the user * * Return 0 on success. If counter mode wasn't changed then it is considered * as success as well. * Return -EBUSY when changing to auto mode while there are bounded counters. * */ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, bool bind_opcnt, struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; enum rdma_nl_counter_mode mode; int ret; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; mutex_lock(&port_counter->lock); if (mask) mode = RDMA_COUNTER_MODE_AUTO; else mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : RDMA_COUNTER_MODE_NONE; if (port_counter->mode.mode == mode && port_counter->mode.mask == mask && port_counter->mode.bind_opcnt == bind_opcnt) { ret = 0; goto out; } ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt); out: mutex_unlock(&port_counter->lock); if (ret == -EBUSY) NL_SET_ERR_MSG( extack, "Modifying auto mode is not allowed when there is a bound QP"); return ret; } static void auto_mode_init_counter(struct rdma_counter *counter, const struct ib_qp *qp, enum rdma_nl_counter_mask new_mask) { struct auto_mode_param *param = &counter->mode.param; counter->mode.mode = RDMA_COUNTER_MODE_AUTO; counter->mode.mask = new_mask; if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) param->qp_type = qp->qp_type; } static int __rdma_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp, u32 port) { int ret; if (qp->counter) return -EINVAL; if (!qp->device->ops.counter_bind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_bind_qp(counter, qp, port); mutex_unlock(&counter->lock); return ret; } int rdma_counter_modify(struct ib_device *dev, u32 port, unsigned int index, bool enable) { struct rdma_hw_stats *stats; int ret = 0; if (!dev->ops.modify_hw_stat) return -EOPNOTSUPP; stats = ib_get_hw_stats_port(dev, port); if (!stats || index >= stats->num_counters || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) return -EINVAL; mutex_lock(&stats->lock); if (enable != test_bit(index, stats->is_disabled)) goto out; ret = dev->ops.modify_hw_stat(dev, port, index, enable); if (ret) goto out; if (enable) clear_bit(index, stats->is_disabled); else set_bit(index, stats->is_disabled); out: mutex_unlock(&stats->lock); return ret; } static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, struct ib_qp *qp, enum rdma_nl_counter_mode mode, bool bind_opcnt) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; int ret; if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; counter = rdma_zalloc_drv_obj(dev, rdma_counter); if (!counter) return NULL; counter->device = dev; counter->port = port; dev->ops.counter_init(counter); rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) goto err_stats; port_counter = &dev->port_data[port].port_counter; mutex_lock(&port_counter->lock); switch (mode) { case RDMA_COUNTER_MODE_MANUAL: ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, 0, bind_opcnt); if (ret) { mutex_unlock(&port_counter->lock); goto err_mode; } break; case RDMA_COUNTER_MODE_AUTO: auto_mode_init_counter(counter, qp, port_counter->mode.mask); break; default: ret = -EOPNOTSUPP; mutex_unlock(&port_counter->lock); goto err_mode; } port_counter->num_counters++; mutex_unlock(&port_counter->lock); counter->mode.mode = mode; counter->mode.bind_opcnt = bind_opcnt; kref_init(&counter->kref); mutex_init(&counter->lock); ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_mode; rdma_restrack_parent_name(&counter->res, &qp->res); rdma_restrack_add(&counter->res); return counter; err_mode: rdma_free_hw_stats_struct(counter->stats); err_stats: rdma_restrack_put(&counter->res); kfree(counter); return NULL; } static void rdma_counter_free(struct rdma_counter *counter) { struct rdma_port_counter *port_counter; port_counter = &counter->device->port_data[counter->port].port_counter; mutex_lock(&port_counter->lock); port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0, false); mutex_unlock(&port_counter->lock); rdma_restrack_del(&counter->res); rdma_free_hw_stats_struct(counter->stats); kfree(counter); } static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, enum rdma_nl_counter_mask auto_mask) { struct auto_mode_param *param = &counter->mode.param; bool match = true; if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); if (auto_mask & RDMA_COUNTER_MASK_PID) match &= (task_pid_nr(counter->res.task) == task_pid_nr(qp->res.task)); return match; } static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; int ret; if (!qp->device->ops.counter_unbind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_unbind_qp(qp, port); mutex_unlock(&counter->lock); return ret; } static void counter_history_stat_update(struct rdma_counter *counter) { struct ib_device *dev = counter->device; struct rdma_port_counter *port_counter; int i; port_counter = &dev->port_data[counter->port].port_counter; if (!port_counter->hstats) return; rdma_counter_query_stats(counter); for (i = 0; i < counter->stats->num_counters; i++) port_counter->hstats->value[i] += counter->stats->value[i]; } /* * rdma_get_counter_auto_mode - Find the counter that @qp should be bound * with in auto mode * * Return: The counter (with ref-count increased) if found */ static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct rdma_counter *counter = NULL; struct ib_device *dev = qp->device; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; unsigned long id = 0; port_counter = &dev->port_data[port].port_counter; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { counter = container_of(res, struct rdma_counter, res); if ((counter->device != qp->device) || (counter->port != port)) goto next; if (auto_mode_match(qp, counter, port_counter->mode.mask)) break; next: counter = NULL; } if (counter && !kref_get_unless_zero(&counter->kref)) counter = NULL; xa_unlock(&rt->xa); return counter; } static void counter_release(struct kref *kref) { struct rdma_counter *counter; counter = container_of(kref, struct rdma_counter, kref); counter_history_stat_update(counter); counter->device->ops.counter_dealloc(counter); rdma_counter_free(counter); } /* * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on * the auto-mode rule */ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct ib_device *dev = qp->device; struct rdma_counter *counter; int ret; if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) return 0; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) return 0; counter = rdma_get_counter_auto_mode(qp, port); if (counter) { ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) { kref_put(&counter->kref, counter_release); return ret; } } else { counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO, port_counter->mode.bind_opcnt); if (!counter) return -ENOMEM; } return 0; } /* * rdma_counter_unbind_qp - Unbind a qp from a counter * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) */ int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force) { struct rdma_counter *counter = qp->counter; int ret; if (!counter) return -EINVAL; ret = __rdma_counter_unbind_qp(qp, port); if (ret && !force) return ret; kref_put(&counter->kref, counter_release); return 0; } int rdma_counter_query_stats(struct rdma_counter *counter) { struct ib_device *dev = counter->device; int ret; if (!dev->ops.counter_update_stats) return -EINVAL; mutex_lock(&counter->lock); ret = dev->ops.counter_update_stats(counter); mutex_unlock(&counter->lock); return ret; } static u64 get_running_counters_hwstat_sum(struct ib_device *dev, u32 port, u32 index) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct rdma_counter *counter; unsigned long id = 0; u64 sum = 0; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; xa_unlock(&rt->xa); counter = container_of(res, struct rdma_counter, res); if ((counter->device != dev) || (counter->port != port) || rdma_counter_query_stats(counter)) goto next; sum += counter->stats->value[index]; next: xa_lock(&rt->xa); rdma_restrack_put(res); } xa_unlock(&rt->xa); return sum; } /* * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a * specific port, including the running ones and history data */ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) { struct rdma_port_counter *port_counter; u64 sum; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return 0; sum = get_running_counters_hwstat_sum(dev, port, index); sum += port_counter->hstats->value[index]; return sum; } static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) { struct rdma_restrack_entry *res = NULL; struct ib_qp *qp = NULL; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); if (IS_ERR(res)) return NULL; qp = container_of(res, struct ib_qp, res); if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev)) goto err; return qp; err: rdma_restrack_put(res); return NULL; } static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, u32 counter_id) { struct rdma_restrack_entry *res; struct rdma_counter *counter; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); if (IS_ERR(res)) return NULL; counter = container_of(res, struct rdma_counter, res); kref_get(&counter->kref); rdma_restrack_put(res); return counter; } /* * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id */ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; counter = rdma_get_counter_by_id(dev, counter_id); if (!counter) { ret = -ENOENT; goto err; } if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { ret = -EINVAL; goto err_task; } if ((counter->device != qp->device) || (counter->port != qp->port)) { ret = -EINVAL; goto err_task; } ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_task; rdma_restrack_put(&qp->res); return 0; err_task: kref_put(&counter->kref, counter_release); err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it * The id of new counter is returned in @counter_id */ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, u32 qp_num, u32 *counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto err; } counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true); if (!counter) { ret = -ENOMEM; goto err; } if (counter_id) *counter_id = counter->id; rdma_restrack_put(&qp->res); return 0; err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter */ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto out; } port_counter = &dev->port_data[port].port_counter; if (!qp->counter || qp->counter->id != counter_id || port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { ret = -EINVAL; goto out; } ret = rdma_counter_unbind_qp(qp, port, false); out: rdma_restrack_put(&qp->res); return ret; } int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, enum rdma_nl_counter_mask *mask, bool *opcnt) { struct rdma_port_counter *port_counter; port_counter = &dev->port_data[port].port_counter; *mode = port_counter->mode.mode; *mask = port_counter->mode.mask; *opcnt = port_counter->mode.bind_opcnt; return 0; } void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port, i; if (!dev->port_data) return; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); if (!dev->ops.alloc_hw_port_stats) continue; port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); if (!port_counter->hstats) goto fail; } return; fail: for (i = port; i >= rdma_start_port(dev); i--) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); port_counter->hstats = NULL; mutex_destroy(&port_counter->lock); } } void rdma_counter_release(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); mutex_destroy(&port_counter->lock); } }
2192 2192 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> #include <linux/of_iommu.h> #include <linux/of_reserved_mem.h> #include <linux/dma-direct.h> /* for bus_dma_region */ #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <asm/errno.h> #include "of_private.h" /** * of_match_device - Tell if a struct device matches an of_device_id list * @matches: array of of_device_id match structures to search in * @dev: the OF device structure to match against * * Used by a driver to check whether an platform_device present in the * system is in its list of supported devices. */ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } EXPORT_SYMBOL(of_match_device); static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *of_node = dev->of_node; struct of_phandle_iterator it; int rc, i = 0; if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) return; /* * If dev->of_node doesn't exist or doesn't contain memory-region, try * the OF node having DMA configuration. */ if (!of_property_present(of_node, "memory-region")) of_node = np; of_for_each_phandle(&it, rc, of_node, "memory-region", NULL, 0) { /* * There might be multiple memory regions, but only one * restricted-dma-pool region is allowed. */ if (of_device_is_compatible(it.node, "restricted-dma-pool") && of_device_is_available(it.node)) { if (of_reserved_mem_device_init_by_idx(dev, of_node, i)) dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); of_node_put(it.node); break; } i++; } } /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. * * If platform code needs to use its own special DMA configuration, it * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 mask, end = 0; bool coherent, set_map = false; int ret; if (dev->dma_range_map) { dev_dbg(dev, "dma_range_map already set\n"); goto skip_map; } if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else bus_np = of_node_get(np); ret = of_dma_get_range(bus_np, &map); of_node_put(bus_np); if (ret < 0) { /* * For legacy reasons, we have to assume some devices need * DMA configuration regardless of whether "dma-ranges" is * correctly specified or not. */ if (!force_dma) return ret == -ENODEV ? 0 : ret; } else { /* Determine the overall bounds of all DMA regions */ end = dma_range_map_max(map); set_map = true; } skip_map: /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For * now, we'll continue the legacy behaviour of coercing it to the * coherent mask if not, but we'll no longer do so quietly. */ if (!dev->dma_mask) { dev_warn(dev, "DMA mask not set\n"); dev->dma_mask = &dev->coherent_dma_mask; } if (!end && dev->coherent_dma_mask) end = dev->coherent_dma_mask; else if (!end) end = (1ULL << 32) - 1; /* * Limit coherent and dma mask based on size and default mask * set by the driver. */ mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; /* ...but only set bus limit and range map if we found valid dma-ranges earlier */ if (set_map) { dev->bus_dma_limit = end; dev->dma_range_map = map; } coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); ret = of_iommu_configure(dev, np, id); if (ret == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (set_map) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; } /* Take all other IOMMU errors to mean we'll just carry on without it */ dev_dbg(dev, "device is%sbehind an iommu\n", !ret ? " " : " not "); arch_setup_dma_ops(dev, coherent); if (ret) of_dma_set_restricted_buffer(dev, np); return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); const void *of_device_get_match_data(const struct device *dev) { const struct of_device_id *match; match = of_match_device(dev->driver->of_match_table, dev); if (!match) return NULL; return match->data; } EXPORT_SYMBOL(of_device_get_match_data); /** * of_device_modalias - Fill buffer with newline terminated modalias string * @dev: Calling device * @str: Modalias string * @len: Size of @str */ ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len) { ssize_t sl; if (!dev || !dev->of_node || dev->of_node_reused) return -ENODEV; sl = of_modalias(dev->of_node, str, len - 2); if (sl < 0) return sl; if (sl > len - 2) return -ENOMEM; str[sl++] = '\n'; str[sl] = 0; return sl; } EXPORT_SYMBOL_GPL(of_device_modalias); /** * of_device_uevent - Display OF related uevent information * @dev: Device to display the uevent information for * @env: Kernel object's userspace event reference to fill up */ void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const char *compat, *type; struct alias_prop *app; struct property *p; int seen = 0; if ((!dev) || (!dev->of_node)) return; add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node); add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node); type = of_node_get_device_type(dev->of_node); if (type) add_uevent_var(env, "OF_TYPE=%s", type); /* Since the compatible field can contain pretty much anything * it's not really legal to split it out with commas. We split it * up using a number of environment variables instead. */ of_property_for_each_string(dev->of_node, "compatible", p, compat) { add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat); seen++; } add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen); seen = 0; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (dev->of_node == app->np) { add_uevent_var(env, "OF_ALIAS_%d=%s", seen, app->alias); seen++; } } mutex_unlock(&of_mutex); } EXPORT_SYMBOL_GPL(of_device_uevent); int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { int sl; if ((!dev) || (!dev->of_node) || dev->of_node_reused) return -ENODEV; /* Devicetree modalias is tricky, we add it in 2 steps */ if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; sl = of_modalias(dev->of_node, &env->buf[env->buflen-1], sizeof(env->buf) - env->buflen); if (sl < 0) return sl; if (sl >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += sl; return 0; } EXPORT_SYMBOL_GPL(of_device_uevent_modalias); /** * of_device_make_bus_id - Use the device node data to assign a unique name * @dev: pointer to device structure that is linked to a device tree node * * This routine will first try using the translated bus address to * derive a unique name. If it cannot, then it will prepend names from * parent nodes until a unique name can be derived. */ void of_device_make_bus_id(struct device *dev) { struct device_node *node = dev->of_node; const __be32 *reg; u64 addr; u32 mask; /* Construct the name, using parent nodes if necessary to ensure uniqueness */ while (node->parent) { /* * If the address can be translated, then that is as much * uniqueness as we need. Make it the first component and return */ reg = of_get_property(node, "reg", NULL); if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) { if (!of_property_read_u32(node, "mask", &mask)) dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn", addr, ffs(mask) - 1, node, dev_name(dev)); else dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn", addr, node, dev_name(dev)); return; } /* format arguments only used if dev_name() resolves to NULL */ dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s", kbasename(node->full_name), dev_name(dev)); node = node->parent; } } EXPORT_SYMBOL_GPL(of_device_make_bus_id);
150 4 7 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/leds.h> #include "ieee80211_i.h" #define MAC80211_BLINK_DELAY 50 /* ms */ static inline void ieee80211_led_rx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->rx_led_active)) return; led_trigger_blink_oneshot(&local->rx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } static inline void ieee80211_led_tx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->tx_led_active)) return; led_trigger_blink_oneshot(&local->tx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } #ifdef CONFIG_MAC80211_LEDS void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); void ieee80211_led_radio(struct ieee80211_local *local, bool enabled); void ieee80211_alloc_led_names(struct ieee80211_local *local); void ieee80211_free_led_names(struct ieee80211_local *local); void ieee80211_led_init(struct ieee80211_local *local); void ieee80211_led_exit(struct ieee80211_local *local); void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off); #else static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { } static inline void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { } static inline void ieee80211_alloc_led_names(struct ieee80211_local *local) { } static inline void ieee80211_free_led_names(struct ieee80211_local *local) { } static inline void ieee80211_led_init(struct ieee80211_local *local) { } static inline void ieee80211_led_exit(struct ieee80211_local *local) { } static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { } #endif static inline void ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } static inline void ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* * PHY device list allow maintaining a list of PHY devices that are * part of a netdevice's link topology. PHYs can for example be chained, * as is the case when using a PHY that exposes an SFP module, on which an * SFP transceiver that embeds a PHY is connected. * * This list can then be used by userspace to leverage individual PHY * capabilities. */ #ifndef __PHY_LINK_TOPOLOGY_H #define __PHY_LINK_TOPOLOGY_H #include <linux/ethtool.h> #include <linux/netdevice.h> struct xarray; struct phy_device; struct sfp_bus; struct phy_link_topology { struct xarray phys; u32 next_phy_index; }; struct phy_device_node { enum phy_upstream upstream_type; union { struct net_device *netdev; struct phy_device *phydev; } upstream; struct sfp_bus *parent_sfp_bus; struct phy_device *phy; }; #if IS_ENABLED(CONFIG_PHYLIB) int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream); void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy); static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { struct phy_link_topology *topo = dev->link_topo; struct phy_device_node *pdn; if (!topo) return NULL; pdn = xa_load(&topo->phys, phyindex); if (pdn) return pdn->phy; return NULL; } #else static inline int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream) { return 0; } static inline void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy) { } static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { return NULL; } #endif #endif /* __PHY_LINK_TOPOLOGY_H */
27 22 22 1 2 13 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-only /* iptables module for using new netfilter netlink queue * * (C) 2005 by Harald Welte <laforge@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFQUEUE.h> #include <net/netfilter/nf_queue.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFQUEUE"); MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static u32 jhash_initval __read_mostly; static unsigned int nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info *tinfo = par->targinfo; return NF_QUEUE_NR(tinfo->queuenum); } static unsigned int nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v1 *info = par->targinfo; u32 queue = info->queuenum; if (info->queues_total > 1) { queue = nfqueue_hash(skb, queue, info->queues_total, xt_family(par), jhash_initval); } return NF_QUEUE_NR(queue); } static unsigned int nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v2 *info = par->targinfo; unsigned int ret = nfqueue_tg_v1(skb, par); if (info->bypass) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; return ret; } static int nfqueue_tg_check(const struct xt_tgchk_param *par) { const struct xt_NFQ_info_v3 *info = par->targinfo; u32 maxid; init_hashrandom(&jhash_initval); if (info->queues_total == 0) { pr_info_ratelimited("number of total queues is 0\n"); return -EINVAL; } maxid = info->queues_total - 1 + info->queuenum; if (maxid > 0xffff) { pr_info_ratelimited("number of queues (%u) out of range (got %u)\n", info->queues_total, maxid); return -ERANGE; } if (par->target->revision == 2 && info->flags > 1) return -EINVAL; if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK) return -EINVAL; return 0; } static unsigned int nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v3 *info = par->targinfo; u32 queue = info->queuenum; int ret; if (info->queues_total > 1) { if (info->flags & NFQ_FLAG_CPU_FANOUT) { int cpu = smp_processor_id(); queue = info->queuenum + cpu % info->queues_total; } else { queue = nfqueue_hash(skb, queue, info->queues_total, xt_family(par), jhash_initval); } } ret = NF_QUEUE_NR(queue); if (info->flags & NFQ_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; return ret; } static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", .family = NFPROTO_UNSPEC, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v1, .targetsize = sizeof(struct xt_NFQ_info_v1), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 2, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v2, .targetsize = sizeof(struct xt_NFQ_info_v2), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 3, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v3, .targetsize = sizeof(struct xt_NFQ_info_v3), .me = THIS_MODULE, }, }; static int __init nfqueue_tg_init(void) { return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } static void __exit nfqueue_tg_exit(void) { xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } module_init(nfqueue_tg_init); module_exit(nfqueue_tg_exit);
5273 450 196 189 185 21 9477 9486 9489 5284 5605 113 8127 440 1865 1864 1863 4 1873 1869 1868 1856 6346 6367 5890 556 6366 1556 1556 38 1389 1517 1557 1516 2365 2323 211 299 2363 631 655 2062 2365 2368 57 57 57 3 52 55 57 55 2640 2645 2644 2643 2640 661 662 661 662 2361 3 237 1 337 20 37 15 2405 667 7 2111 38 2411 11 2414 2413 38 376 76 3151 8 3278 1319 1294 40 41 20 18 24 1117 1080 38 1094 36 85 91 39 15 4 5 4 3 3 15 534 582 2636 4 2642 2644 2640 57 2642 2595 107 2639 2637 2642 2645 2645 2643 2678 2680 2683 2683 2676 2633 126 2680 2633 127 2678 23 24 2680 2681 2682 2 2 1 2681 2678 2680 2676 161 2678 2684 2684 161 2678 10 2684 2679 2681 21 2676 179 21 234 8 4 2679 2679 2637 433 2678 12 2684 2677 2681 2680 1888 1377 2684 2677 8 2684 2678 15 2404 30 2622 2677 2683 2 2677 65 2678 2681 2677 8 2682 1 2635 127 2 2680 4 2680 2682 2680 9 2681 4 5 2680 4 5 3 3 1 5 3 2 3 3 4 3 4 2 8 7 5 10 6 2 5 5 7 6 9 2653 216 580 2295 1 27 23 2680 2408 2681 2679 2678 2680 10 2681 2682 30 11 2 2541 289 289 2 4 2682 42 1 2 3 41 1 44 44 44 13 13 3 1 4 4 1 3 45 55 54 7 13 42 41 2 4 4 6 7 1 35 4 44 44 44 43 44 3 47 20 20 1 14 76 7 17 17 17 1859 4 1882 1880 5 1870 8 9 2036 1 2 1708 1707 1704 1696 3 1701 2 2 1696 2 1697 2 1696 1 1611 69 1 7 49 55 3 72 2 4 66 1 4 53 1 9 2 1 3 46 3 3 1 39 1 3 35 2 3 28 3 3 21 3 3 14 3 3 3 1 5 3 2 267 263 3 19 28 28 28 39 83 42 168 168 168 19 8 3 1 1 1 5 2 7 6 595 27 138 436 570 6 576 572 2 535 36 36 19 17 504 16 46 393 160 531 5 16 529 22 545 3 548 469 467 463 50 68 513 3 13 486 520 2 7 513 4 12 522 2 5 525 1 521 2 2 516 11 498 28 432 5 6 3 80 5 434 4 430 1 2 370 50 39 13 418 19 357 3 2 3 3 26 6 1 6 148 135 339 359 161 576 295 13 75 1 4 70 5 63 63 63 49 3 12 8 60 63 2 1 1 1 3 146 162 1 1 4 155 1 135 42 15 20 149 20 40 2 155 138 827 658 659 66 815 1170 26 955 226 12 977 223 1 1 1166 10 1123 3 31 1112 43 1145 9 1151 4 1152 4 1146 8 1151 2 1150 4 1150 4 1150 4 1151 2 1148 3 17 3 291 15 240 354 26 1 34 123 5 164 1167 1170 65 1103 15 1150 3 500 820 820 651 191 713 169 168 1 63 63 1 8 13 21 412 12 267 561 1215 7 165 4 16 1167 1984 4 5 1979 1557 437 1513 440 38 122 1400 82 1440 1292 85 5 4 4 1817 1812 2 1809 1815 1797 1804 1513 1 18 4 4 5 9 9 9 6 37 37 23 2 12 1 8 4 4 2 6 5 6 12 2 7 4 10 34 1 9 2 12 2 9 5 2 13 2 11 4 4 22 13 75 75 6 5 38 5 5 5 5 67 67 44 67 67 67 41 21 66 2631 2504 277 2630 426 928 2369 79 2575 2298 2302 10 10 20 8 4 19 1 4 6 6 8 1 1 1 3 2 5 3 47 3 9 3 49 1 2 3 2 2 1 44 37 7 4 5 27 8 30 6 5 1 3 1 38 17 21 1 2 1 18 1 1 15 1 29 2 4 5 1 3 6 5 2 23 3 15 3 14 15 11 15 15 1 3 1 4 1 6 6 1 1 1 1 2 8 1 3 2 5 31 12 19 14 13 2 4 16 16 12 11 4 4 2 12 13 13 10 15 1 4 4 1 7 14 14 1 3 1 9 3 1 27 27 2 1 1 1 1 2 1 2 1 1 2 2 1 1 1 3 2 5 4 13 5 8 9 6 5 3 8 23 23 15 13 13 13 2 12 13 2 12 13 12 12 67 1 5 11 53 48 2 27 47 9 24 9 25 12 12 2 14 11 11 24 30 1 3 4 20 2 7 5 14 11 9 9 2 2 1 7 7 26 18 18 18 8 18 3 6 7 4 8 20 3 8 8 9 18 20 2 18 1 2 8 8 2 28 28 9 19 8 17 28 15 1 6 15 28 8 4 2 4 27 1 20 28 14 14 14 2 10 14 14 11 11 10 15 5 5 5 8 5 1 1 5 10 10 4 4 4 4 4 2 32 6 24 2 2 46 7 7 5 3 23 23 2 3 1 2 13 23 22 1 4 13 12 13 2 2 2 18 18 1 4 2 1 5 2 1 2 2 1 1 1 1 8 4 4 5 2 5 5 10 1 1 1 1 1 4 1 17 1 10 1 2 1 1 1 22 1 3 1 1 4 1 5 1 4 1 1 1 11 1 19 1 9 1 1 1 1 5 10 1 1 1 1 5 1 27 10 17 12 1 2 1 1 4 1 4 469 25 450 21 397 28 46 29 111 6359 4 1 6354 6355 5852 536 32 459 75 533 422 111 5847 335 5847 1 3785 2272 2 3 6387 10 9 5 2741 2728 943 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Routing netlink socket interface: protocol independent part. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> #include <linux/bpf.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/udp.h> #include <net/tcp.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netdev_lock.h> #include <net/devlink.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/addrconf.h> #endif #include <linux/dpll.h> #include "dev.h" #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; struct module *owner; unsigned int flags; struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock); int rtnl_lock_interruptible(void) { return mutex_lock_interruptible(&rtnl_mutex); } int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { if (head && tail) { tail->next = defer_kfree_skb_list; defer_kfree_skb_list = head; } } EXPORT_SYMBOL(rtnl_kfree_skbs); void __rtnl_unlock(void) { struct sk_buff *head = defer_kfree_skb_list; defer_kfree_skb_list = NULL; /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() * is used. In some places, e.g. in cfg80211, we have code that will do * something like * rtnl_lock() * wiphy_lock() * ... * rtnl_unlock() * * and because netdev_run_todo() acquires the RTNL for items on the list * we could cause a situation such as this: * Thread 1 Thread 2 * rtnl_lock() * unregister_netdevice() * __rtnl_unlock() * rtnl_lock() * wiphy_lock() * rtnl_unlock() * netdev_run_todo() * __rtnl_unlock() * * // list not empty now * // because of thread 2 * rtnl_lock() * while (!list_empty(...)) * rtnl_lock() * wiphy_lock() * **** DEADLOCK **** * * However, usage of __rtnl_unlock() is rare, and so we can ensure that * it's not used in cases where something is added to do the list. */ WARN_ON(!list_empty(&net_todo_list)); mutex_unlock(&rtnl_mutex); while (head) { struct sk_buff *next = head->next; kfree_skb(head); cond_resched(); head = next; } } void rtnl_unlock(void) { /* This fellow will unlock it for us. */ netdev_run_todo(); } EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_is_locked); bool refcount_dec_and_rtnl_lock(refcount_t *r) { return refcount_dec_and_mutex_lock(r, &rtnl_mutex); } EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ #ifdef CONFIG_DEBUG_NET_SMALL_RTNL void __rtnl_net_lock(struct net *net) { ASSERT_RTNL(); mutex_lock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_lock); void __rtnl_net_unlock(struct net *net) { ASSERT_RTNL(); mutex_unlock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_unlock); void rtnl_net_lock(struct net *net) { rtnl_lock(); __rtnl_net_lock(net); } EXPORT_SYMBOL(rtnl_net_lock); void rtnl_net_unlock(struct net *net) { __rtnl_net_unlock(net); rtnl_unlock(); } EXPORT_SYMBOL(rtnl_net_unlock); int rtnl_net_trylock(struct net *net) { int ret = rtnl_trylock(); if (ret) __rtnl_net_lock(net); return ret; } EXPORT_SYMBOL(rtnl_net_trylock); int rtnl_net_lock_killable(struct net *net) { int ret = rtnl_lock_killable(); if (!ret) __rtnl_net_lock(net); return ret; } static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) return 0; /* always init_net first */ if (net_eq(net_a, &init_net)) return -1; if (net_eq(net_b, &init_net)) return 1; /* otherwise lock in ascending order */ return net_a < net_b ? -1 : 1; } int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { const struct net *net_a, *net_b; net_a = container_of(a, struct net, rtnl_mutex.dep_map); net_b = container_of(b, struct net, rtnl_mutex.dep_map); return rtnl_net_cmp_locks(net_a, net_b); } bool rtnl_net_is_locked(struct net *net) { return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); } EXPORT_SYMBOL(rtnl_net_is_locked); bool lockdep_rtnl_net_is_held(struct net *net) { return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_net_is_held); #else static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { /* No need to swap */ return -1; } #endif struct rtnl_nets { /* ->newlink() needs to freeze 3 netns at most; * 2 for the new device, 1 for its peer. */ struct net *net[3]; unsigned char len; }; static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) { memset(rtnl_nets, 0, sizeof(*rtnl_nets)); } static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) { put_net(rtnl_nets->net[i]); rtnl_nets->net[i] = NULL; } rtnl_nets->len = 0; } /** * rtnl_nets_add - Add netns to be locked before ->newlink(). * * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). * @net: netns pointer with an extra refcnt held. * * The extra refcnt is released in rtnl_nets_destroy(). */ static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) { int i; DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); for (i = 0; i < rtnl_nets->len; i++) { switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { case 0: put_net(net); return; case 1: swap(rtnl_nets->net[i], net); } } rtnl_nets->net[i] = net; rtnl_nets->len++; } static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) { int i; rtnl_lock(); for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_lock(rtnl_nets->net[i]); } static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_unlock(rtnl_nets->net[i]); rtnl_unlock(); } static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { int msgindex = msgtype - RTM_BASE; /* * msgindex < 0 implies someone tried to register a netlink * control code. msgindex >= RTM_NR_MSGTYPES may indicate that * the message type has not been added to linux/rtnetlink.h */ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); return msgindex; } static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { struct rtnl_link *link, *old; struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) goto unlock; /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } old = rtnl_dereference(tab[msgindex]); if (old) { link = kmemdup(old, sizeof(*old), GFP_KERNEL); if (!link) goto unlock; } else { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) goto unlock; } WARN_ON(link->owner && link->owner != owner); link->owner = owner; WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) link->dumpit = dumpit; WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); link->flags |= flags; /* publish protocol:msgtype */ rcu_assign_pointer(tab[msgindex], link); ret = 0; if (old) kfree_rcu(old, rcu); unlock: rtnl_unlock(); return ret; } /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return -ENOENT; } link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * * Identical to calling rtnl_unregister() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); synchronize_net(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); /** * __rtnl_register_many - Register rtnetlink message types * @handlers: Array of struct rtnl_msg_handlers * @n: The length of @handlers * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the * specified protocol family and message type is received. * * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. * * When one element of @handlers fails to register, * 1) built-in: panics. * 2) modules : the previous successful registrations are unwinded * and an error is returned. * * Use rtnl_register_many(). */ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i, err; for (i = 0, handler = handlers; i < n; i++, handler++) { err = rtnl_register_internal(handler->owner, handler->protocol, handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { if (!handler->owner) panic("Unable to register rtnetlink message " "handlers, %pS\n", handlers); __rtnl_unregister_many(handlers, i); break; } } return err; } EXPORT_SYMBOL_GPL(__rtnl_register_many); void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i; for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) rtnl_unregister(handler->protocol, handler->msgtype); } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { struct rtnl_link_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &link_ops, list) { if (!strcmp(ops->kind, kind)) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * Returns 0 on success or a negative error code. */ int rtnl_link_register(struct rtnl_link_ops *ops) { struct rtnl_link_ops *tmp; int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; err = init_srcu_struct(&ops->srcu); if (err) return err; mutex_lock(&link_ops_mutex); list_for_each_entry(tmp, &link_ops, list) { if (!strcmp(ops->kind, tmp->kind)) { err = -EEXIST; goto unlock; } } list_add_tail_rcu(&ops->list, &link_ops); unlock: mutex_unlock(&link_ops_mutex); return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; LIST_HEAD(list_kill); for_each_netdev(net, dev) { if (dev->rtnl_link_ops == ops) ops->dellink(dev, &list_kill); } unregister_netdevice_many(&list_kill); } /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; mutex_lock(&link_ops_mutex); list_del_rcu(&ops->list); mutex_unlock(&link_ops_mutex); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); for_each_net(net) __rtnl_kill_links(net, ops); rtnl_unlock(); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; size_t size = 0; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) goto out; ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); return size; } static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; size_t size; if (!ops) return 0; size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); size += rtnl_link_get_slave_info_data_size(dev); return size; } static LIST_HEAD(rtnl_af_ops); static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { struct rtnl_af_ops *ops; ASSERT_RTNL(); rcu_read_lock(); list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { if (ops->family == family) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Return: 0 on success or a negative error code. */ int rtnl_af_register(struct rtnl_af_ops *ops) { int err = init_srcu_struct(&ops->srcu); if (err) return err; rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); list_del_rcu(&ops->list); rtnl_unlock(); synchronize_rcu(); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); static size_t rtnl_link_get_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } rcu_read_unlock(); return size; } static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; bool ret = false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) ret = true; rcu_read_unlock(); return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; struct nlattr *slave_data; int err; master_dev = netdev_master_upper_dev_get((struct net_device *) dev); if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); if (err < 0) goto err_cancel_slave_data; nla_nest_end(skb, slave_data); } return 0; err_cancel_slave_data: nla_nest_cancel(skb, slave_data); return err; } static int rtnl_link_info_fill(struct sk_buff *skb, const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; struct nlattr *data; int err; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) return err; } if (ops->fill_info) { data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } return 0; err_cancel_data: nla_nest_cancel(skb, data); return err; } static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *linkinfo; int err = -EMSGSIZE; linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; err = rtnl_link_info_fill(skb, dev); if (err < 0) goto err_cancel_link; err = rtnl_link_slave_info_fill(skb, dev); if (err < 0) goto err_cancel_link; nla_nest_end(skb, linkinfo); return 0; err_cancel_link: nla_nest_cancel(skb, linkinfo); out: return err; } int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { struct sock *rtnl = net->rtnl; return nlmsg_unicast(rtnl, skb, pid); } EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { struct sock *rtnl = net->rtnl; netlink_set_err(rtnl, 0, group, error); } EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct nlattr *mx; int i, valid = 0; /* nothing is dumped for dst_default_metrics, so just skip the loop */ if (metrics == dst_default_metrics.metrics) return 0; mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { if (i == RTAX_CC_ALGO - 1) { char tmp[TCP_CA_NAME_MAX], *name; name = tcp_ca_get_name_by_key(metrics[i], tmp); if (!name) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; if (!user_features) continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; } valid++; } } if (!valid) { nla_nest_cancel(skb, mx); return 0; } return nla_nest_end(skb, mx); nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { .rta_error = error, .rta_id = id, }; unsigned long delta; if (dst) { delta = jiffies - READ_ONCE(dst->lastuse); ci.rta_lastuse = jiffies_delta_to_clock_t(delta); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } if (expires) { unsigned long clock; clock = jiffies_to_clock_t(abs(expires)); clock = min_t(unsigned long, clock, INT_MAX); ci.rta_expires = (expires > 0) ? clock : -clock; } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); void netif_set_operstate(struct net_device *dev, int newstate) { unsigned int old = READ_ONCE(dev->operstate); do { if (old == newstate) return; } while (!try_cmpxchg(&dev->operstate, &old, newstate)); netif_state_change(dev); } EXPORT_SYMBOL(netif_set_operstate); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; case IF_OPER_TESTING: if (netif_oper_up(dev)) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } netif_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) { return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); } static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int flags = ifm->ifi_flags; /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; a->rx_bytes = b->rx_bytes; a->tx_bytes = b->tx_bytes; a->rx_errors = b->rx_errors; a->tx_errors = b->tx_errors; a->rx_dropped = b->rx_dropped; a->tx_dropped = b->tx_dropped; a->multicast = b->multicast; a->collisions = b->collisions; a->rx_length_errors = b->rx_length_errors; a->rx_over_errors = b->rx_over_errors; a->rx_crc_errors = b->rx_crc_errors; a->rx_frame_errors = b->rx_frame_errors; a->rx_fifo_errors = b->rx_fifo_errors; a->rx_missed_errors = b->rx_missed_errors; a->tx_aborted_errors = b->tx_aborted_errors; a->tx_carrier_errors = b->tx_carrier_errors; a->tx_fifo_errors = b->tx_fifo_errors; a->tx_heartbeat_errors = b->tx_heartbeat_errors; a->tx_window_errors = b->tx_window_errors; a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + nla_total_size(sizeof(struct ifla_vf_trust))); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { size += num_vfs * (nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_DROPPED */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } if (dev->netdev_ops->ndo_get_vf_guid) size += num_vfs * 2 * nla_total_size(sizeof(struct ifla_vf_guid)); return size; } else return 0; } static size_t rtnl_port_size(const struct net_device *dev, u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ + nla_total_size(2); /* PORT_VDP_RESPONSE */ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + port_size; size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + vf_port_size * dev_num_vf(dev->dev.parent); else return port_self_size; } static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; unsigned int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(name_node, &dev->name_node->list, list) cnt++; rcu_read_unlock(); if (!cnt) return 0; return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); /* Assume dev->proto_down_reason is not zero. */ size += nla_total_size(0) + nla_total_size(4); return size; } static size_t rtnl_devlink_port_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ if (dev->devlink_port) size += devlink_nl_port_handle_size(dev->devlink_port); return size; } static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ size += dpll_netdev_pin_handle_size(dev); return size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ + nla_total_size(4) /* IFLA_WEIGHT */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + nla_total_size(2) /* IFLA_HEADROOM */ + nla_total_size(2) /* IFLA_TAILROOM */ + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *vf_ports; struct nlattr *vf_port; int vf; int err; vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) goto nla_put_failure; err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); if (err == -EMSGSIZE) goto nla_put_failure; if (err) { nla_nest_cancel(skb, vf_port); continue; } nla_nest_end(skb, vf_port); } nla_nest_end(skb, vf_ports); return 0; nla_put_failure: nla_nest_cancel(skb, vf_ports); return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *port_self; int err; port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); return (err == -EMSGSIZE) ? err : 0; } nla_nest_end(skb, port_self); return 0; } static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { int err; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); if (err) return err; if (dev_num_vf(dev->dev.parent)) { err = rtnl_vf_ports_fill(skb, dev); if (err) return err; } return 0; } static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) { char name[IFNAMSIZ]; int err; err = dev_get_phys_port_name(dev, name, sizeof(name)); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; } static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { struct netdev_phys_item_id ppid = { }; int err; err = netif_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_stats64 *sp; struct nlattr *attr; attr = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; sp = nla_data(attr); dev_get_stats(dev, sp); attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; copy_rtnl_link_stats(nla_data(attr), sp); return 0; } static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; struct ifla_vf_guid node_guid; struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver * didn't report anything. */ ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; /* VLAN Protocol by default is 802.1Q */ ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); memset(&node_guid, 0, sizeof(node_guid)); memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = vf_trust.vf = node_guid.vf = port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; vf_vlan_info.qos = ivi.qos; vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), &node_guid) || nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), &port_guid)) goto nla_put_vf_failure; } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), &vf_vlan_info)) { nla_nest_cancel(skb, vfvlanlist); goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, vf_stats.tx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, vf_stats.multicast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } nla_nest_end(skb, vfstats); } nla_nest_end(skb, vf); return 0; nla_put_vf_failure: nla_nest_cancel(skb, vf); return -EMSGSIZE; } static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { struct nlattr *vfinfo; int i, num_vfs; if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) return 0; num_vfs = dev_num_vf(dev->dev.parent); if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) return -EMSGSIZE; if (!dev->netdev_ops->ndo_get_vf_config) return 0; vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } } nla_nest_end(skb, vfinfo); return 0; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, const struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); map.mem_start = READ_ONCE(dev->mem_start); map.mem_end = READ_ONCE(dev->mem_end); map.base_addr = READ_ONCE(dev->base_addr); map.irq = READ_ONCE(dev->irq); map.dma = READ_ONCE(dev->dma); map.port = READ_ONCE(dev->if_port); if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; } static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; u32 res = 0; rcu_read_lock(); generic_xdp_prog = rcu_dereference(dev->xdp_prog); if (generic_xdp_prog) res = generic_xdp_prog->aux->id; rcu_read_unlock(); return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, u32 (*get_prog_id)(struct net_device *dev)) { u32 curr_id; int err; curr_id = get_prog_id(dev); if (!curr_id) return 0; *prog_id = curr_id; err = nla_put_u32(skb, attr, curr_id); if (err) return err; if (*mode != XDP_ATTACHED_NONE) *mode = XDP_ATTACHED_MULTI; else *mode = tgt_mode; return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; u32 prog_id; int err; u8 mode; xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; prog_id = 0; mode = XDP_ATTACHED_NONE; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; } nla_nest_end(skb, xdp); return 0; err_cancel: nla_nest_cancel(skb, xdp); return err; } static u32 rtnl_get_event(unsigned long event) { u32 rtnl_event_type = IFLA_EVENT_NONE; switch (event) { case NETDEV_REBOOT: rtnl_event_type = IFLA_EVENT_REBOOT; break; case NETDEV_FEAT_CHANGE: rtnl_event_type = IFLA_EVENT_FEATURES; break; case NETDEV_BONDING_FAILOVER: rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; break; case NETDEV_NOTIFY_PEERS: rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; break; case NETDEV_RESEND_IGMP: rtnl_event_type = IFLA_EVENT_IGMP_RESEND; break; case NETDEV_CHANGEINFODATA: rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; break; default: break; } return rtnl_event_type; } static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) { const struct net_device *upper_dev; int ret = 0; rcu_read_lock(); upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) ret = nla_put_u32(skb, IFLA_MASTER, READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; } static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { int iflink = dev_get_iflink(dev); if (force || READ_ONCE(dev->ifindex) != iflink) return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev) { char buf[IFALIASZ]; int ret; ret = dev_get_alias(dev, buf, sizeof(buf)); return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; } static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net, gfp_t gfp) { bool put_iflink = false; if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; put_iflink = true; } } return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; if (!af_ops->fill_link_af) continue; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there * was no data to be dumped. This is not an error, it * means we should trim the attribute header and * continue. */ if (err == -ENODATA) nla_nest_cancel(skb, af); else if (err < 0) return -EMSGSIZE; nla_nest_end(skb, af); } nla_nest_end(skb, af_spec); return 0; } static int rtnl_fill_alt_ifnames(struct sk_buff *skb, const struct net_device *dev) { struct netdev_name_node *name_node; int count = 0; list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; } return count; } /* RCU protected. */ static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *prop_list; int ret; prop_list = nla_nest_start(skb, IFLA_PROP_LIST); if (!prop_list) return -EMSGSIZE; ret = rtnl_fill_alt_ifnames(skb, dev); if (ret <= 0) goto nest_cancel; nla_nest_end(skb, prop_list); return 0; nest_cancel: nla_nest_cancel(skb, prop_list); return ret; } static int rtnl_fill_proto_down(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *pr; u32 preason; if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); if (!pr) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { nla_nest_cancel(skb, pr); goto nla_put_failure; } nla_nest_end(skb, pr); return 0; nla_put_failure: return -EMSGSIZE; } static int rtnl_fill_devlink_port(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *devlink_port_nest; int ret; devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); if (!devlink_port_nest) return -EMSGSIZE; if (dev->devlink_port) { ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); if (ret < 0) goto nest_cancel; } nla_nest_end(skb, devlink_port_nest); return 0; nest_cancel: nla_nest_cancel(skb, devlink_port_nest); return ret; } static int rtnl_fill_dpll_pin(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *dpll_pin_nest; int ret; dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); if (!dpll_pin_nest) return -EMSGSIZE; ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; nla_nest_end(skb, dpll_pin_nest); return 0; nest_cancel: nla_nest_cancel(skb, dpll_pin_nest); return ret; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; ifm->ifi_type = READ_ONCE(dev->type); ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; netdev_copy_name(dev, devname); if (nla_put_string(skb, IFLA_IFNAME, devname)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) || nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, READ_ONCE(dev->num_tx_queues)) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, READ_ONCE(dev->gso_max_segs)) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, READ_ONCE(dev->gso_max_size)) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, READ_ONCE(dev->gro_max_size)) || nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, READ_ONCE(dev->gso_ipv4_max_size)) || nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, READ_ONCE(dev->gro_ipv4_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count)) || nla_put_u16(skb, IFLA_HEADROOM, READ_ONCE(dev->needed_headroom)) || nla_put_u16(skb, IFLA_TAILROOM, READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) goto nla_put_failure; } if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; if (new_ifindex && nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) goto nla_put_failure_rcu; qdisc = rcu_dereference(dev->qdisc); if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure_rcu; if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure_rcu; rcu_read_unlock(); if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, dev_name(dev->dev.parent))) goto nla_put_failure; if (dev->dev.parent && dev->dev.parent->bus && nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, dev->dev.parent->bus->name)) goto nla_put_failure; if (rtnl_fill_devlink_port(skb, dev)) goto nla_put_failure; if (rtnl_fill_dpll_pin(skb, dev)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to * allow 0-length string (needed to remove an alias). */ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, [IFLA_HEADROOM] = { .type = NLA_REJECT }, [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, /* Unused, but we need to keep it here since user space could * fill it. It's also broken with regard to NLA_BINARY use in * combination with structs. */ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, int *ops_srcu_index) { struct nlattr *linfo[IFLA_INFO_MAX + 1]; struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; } static bool link_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need * another invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool link_kind_filtered(const struct net_device *dev, const struct rtnl_link_ops *kind_ops) { if (kind_ops && dev->rtnl_link_ops != kind_ops) return true; return false; } static bool link_dump_filtered(struct net_device *dev, int master_idx, const struct rtnl_link_ops *kind_ops) { if (link_master_filtered(dev, master_idx) || link_kind_filtered(dev, kind_ops)) return true; return false; } /** * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. * @sk: netlink socket * @netnsid: network namespace identifier * * Returns the network namespace identified by netnsid on success or an error * pointer on failure. */ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } return net; } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, bool strict_check, struct nlattr **tb, struct netlink_ext_ack *extack) { int hdrlen; if (strict_check) { struct ifinfomsg *ifm; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); return -EINVAL; } if (ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); return -EINVAL; } return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); } /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's * what iproute2 < v3.9.0 used. * We can detect the old iproute2. Even including the IFLA_EXT_MASK * attribute, its netlink message is shorter than struct ifinfomsg. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; struct { unsigned long ifindex; } *ctx = (void *)cb->ctx; struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) return err; goto walk_entries; } for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; /* new attributes should only be added with strict checking */ switch (i) { case IFLA_TARGET_NETNSID: netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); err = PTR_ERR(tgt_net); netnsid = -1; goto out; } break; case IFLA_EXT_MASK: ext_filter_mask = nla_get_u32(tb[i]); break; case IFLA_MASTER: master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); err = -EINVAL; goto out; } } } if (master_idx || kind_ops) flags |= NLM_F_DUMP_FILTERED; walk_entries: err = 0; for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { if (link_dump_filtered(dev, master_idx, kind_ops)) continue; err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) break; } cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); out: if (kind_ops) rtnl_link_ops_put(kind_ops, ops_srcu_index); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr) { const struct ifinfomsg *ifmp; const struct nlattr *attrs; size_t len; ifmp = nla_data(nla_peer); attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); len = nla_len(nla_peer) - sizeof(struct ifinfomsg); if (ifmp->ifi_index < 0) { NL_SET_ERR_MSG_ATTR(exterr, nla_peer, "ifindex can't be negative"); return -EINVAL; } return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { struct net *net = NULL; /* Examine the link attributes and figure out which * network namespace we are talking about. */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); return net; } struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net = rtnl_link_get_net_ifla(tb); if (!net) net = get_net(src_net); return net; } EXPORT_SYMBOL(rtnl_link_get_net); /* Figure out which network namespace we are talking about by * examining the link attributes in the following order: * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) { struct net *net; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); return net; } static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) { struct net *net; net = rtnl_link_get_net_by_nlattr(src_net, tb); if (IS_ERR(net)) return net; if (!netlink_ns_capable(skb, net->user_ns, cap)) { put_net(net); return ERR_PTR(-EPERM); } return net; } /* Verify that rtnetlink requests do not pass additional properties * potentially referring to different network namespaces. */ static int rtnl_ensure_unique_netns(struct nlattr *tb[], struct netlink_ext_ack *extack, bool netns_id_only) { if (netns_id_only) { if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) return 0; NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); return -EOPNOTSUPP; } if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; invalid_attr: NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); return -EINVAL; } static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_set_vf_rate) return -EOPNOTSUPP; if (max_tx_rate && max_tx_rate < min_tx_rate) return -EINVAL; return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); } static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) return -EINVAL; if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; if (tb[IFLA_GSO_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_max_size"); return -EINVAL; } if (tb[IFLA_GSO_MAX_SEGS] && (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { NL_SET_ERR_MSG(extack, "too big gso_max_segs"); return -EINVAL; } if (tb[IFLA_GRO_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_max_size"); return -EINVAL; } if (tb[IFLA_GSO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_GRO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { struct rtnl_af_ops *af_ops; int af_ops_srcu_index; af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) err = -EOPNOTSUPP; else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); else err = 0; rtnl_af_put(af_ops, af_ops_srcu_index); if (err < 0) return err; } } return 0; } static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { const struct net_device_ops *ops = dev->netdev_ops; return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); } static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { if (dev->type != ARPHRD_INFINIBAND) return -EOPNOTSUPP; return handle_infiniband_guid(dev, ivt, guid_type); } static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; int err = -EINVAL; if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); if (ivm->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); if (err < 0) return err; } if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); if (ivv->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, ivv->qos, htons(ETH_P_8021Q)); if (err < 0) return err; } if (tb[IFLA_VF_VLAN_LIST]) { struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; struct nlattr *attr; int rem, len = 0; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_vlan) return err; nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) return -EOPNOTSUPP; ivvl[len] = nla_data(attr); len++; } if (len == 0) return -EINVAL; if (ivvl[0]->vf >= INT_MAX) return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); if (err < 0) return err; err = rtnl_set_vf_rate(dev, ivt->vf, ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); if (ivt->vf >= INT_MAX) return -EINVAL; err = rtnl_set_vf_rate(dev, ivt->vf, ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); if (ivs->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, ivs->setting); if (err < 0) return err; } if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); if (ivl->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, ivl->link_state); if (err < 0) return err; } if (tb[IFLA_VF_RSS_QUERY_EN]) { struct ifla_vf_rss_query_en *ivrssq_en; err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); if (ivrssq_en->vf >= INT_MAX) return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); if (err < 0) return err; } if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); if (err < 0) return err; } if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); } return err; } static int do_set_master(struct net_device *dev, int ifindex, struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; /* Release the lower lock, the upper is responsible for locking * the lower if needed. None of the existing upper devices * use netdev instance lock, so don't grab it. */ if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { netdev_unlock_ops(dev); err = ops->ndo_del_slave(upper_dev, dev); netdev_lock_ops(dev); if (err) return err; } else { return -EOPNOTSUPP; } } if (ifindex) { upper_dev = __dev_get_by_index(dev_net(dev), ifindex); if (!upper_dev) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { netdev_unlock_ops(dev); err = ops->ndo_add_slave(upper_dev, dev, extack); netdev_lock_ops(dev); if (err) return err; } else { return -EOPNOTSUPP; } } return 0; } static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, }; static int do_set_proto_down(struct net_device *dev, struct nlattr *nl_proto_down, struct nlattr *nl_proto_down_reason, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; unsigned long mask = 0; u32 value; bool proto_down; int err; if (!dev->change_proto_down) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } if (nl_proto_down_reason) { err = nla_parse_nested_deprecated(pdreason, IFLA_PROTO_DOWN_REASON_MAX, nl_proto_down_reason, ifla_proto_down_reason_policy, NULL); if (err < 0) return err; if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); return -EINVAL; } value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); netdev_change_proto_down_reason_locked(dev, mask, value); } if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } err = netif_change_proto_down(dev, proto_down); if (err) return err; } return 0; } #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { const struct net_device_ops *ops = dev->netdev_ops; char ifname[IFNAMSIZ]; int err; err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? ifname : NULL; int new_ifindex; new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex, extack); if (err) return err; status |= DO_SETLINK_MODIFIED; } netdev_lock_ops(dev); if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; if (!ops->ndo_set_config) { err = -EOPNOTSUPP; goto errout; } if (!netif_device_present(dev)) { err = -ENODEV; goto errout; } u_map = nla_data(tb[IFLA_MAP]); k_map.mem_start = (unsigned long) u_map->mem_start; k_map.mem_end = (unsigned long) u_map->mem_end; k_map.base_addr = (unsigned short) u_map->base_addr; k_map.irq = (unsigned char) u_map->irq; k_map.dma = (unsigned char) u_map->dma; k_map.port = (unsigned char) u_map->port; err = ops->ndo_set_config(dev, &k_map); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { struct sockaddr_storage ss = { }; netdev_unlock_ops(dev); /* dev_addr_sem is an outer lock, enforce proper ordering */ down_write(&dev_addr_sem); netdev_lock_ops(dev); ss.ss_family = dev->type; memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); err = netif_set_mac_address(dev, &ss, extack); if (err) { up_write(&dev_addr_sem); goto errout; } status |= DO_SETLINK_MODIFIED; up_write(&dev_addr_sem); } if (tb[IFLA_MTU]) { err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } /* * Interface selected by interface index but interface * name provided implies that a name change has been * requested. */ if (ifm->ifi_index > 0 && ifname[0]) { err = netif_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (ifm->ifi_flags || ifm->ifi_change) { err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); if (err < 0) goto errout; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); err = netif_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); if (dev->gso_max_segs ^ max_segs) { netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); if (dev->gro_max_size ^ gro_max_size) { netif_set_gro_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); if (dev->gso_ipv4_max_size ^ max_size) { netif_set_gso_ipv4_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); if (dev->gro_ipv4_max_size ^ gro_max_size) { netif_set_gro_ipv4_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { if (nla_type(attr) != IFLA_VF_INFO || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, attr, ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_VF_PORTS]) { struct nlattr *port[IFLA_PORT_MAX+1]; struct nlattr *attr; int vf; int rem; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_port) goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { if (nla_type(attr) != IFLA_VF_PORT || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, attr, ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { err = -EOPNOTSUPP; goto errout; } vf = nla_get_u32(port[IFLA_PORT_VF]); err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], ifla_port_policy, NULL); if (err < 0) goto errout; err = -EOPNOTSUPP; if (ops->ndo_set_vf_port) err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { struct rtnl_af_ops *af_ops; int af_ops_srcu_index; af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) { err = -EAFNOSUPPORT; goto errout; } err = af_ops->set_link_af(dev, af, extack); rtnl_af_put(af_ops, af_ops_srcu_index); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; u32 xdp_flags = 0; err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy, NULL); if (err < 0) goto errout; if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } if (xdp[IFLA_XDP_FLAGS]) { xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); if (xdp_flags & ~XDP_FLAGS_MASK) { err = -EINVAL; goto errout; } if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { err = -EINVAL; goto errout; } } if (xdp[IFLA_XDP_FD]) { int expected_fd = -1; if (xdp_flags & XDP_FLAGS_REPLACE) { if (!xdp[IFLA_XDP_EXPECTED_FD]) { err = -EINVAL; goto errout; } expected_fd = nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); } err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), expected_fd, xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } } errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) netif_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } netdev_unlock_ops(dev); return err; } static struct net_device *rtnl_dev_get(struct net *net, struct nlattr *tb[]) { char ifname[ALTIFNAMSIZ]; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else if (tb[IFLA_ALT_IFNAME]) nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); else return NULL; return __dev_get_by_name(net, ifname); } static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct rtnl_nets rtnl_nets; struct net *tgt_net; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) goto errout; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) goto errout; tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { err = PTR_ERR(tgt_net); goto errout; } rtnl_nets_init(&rtnl_nets); rtnl_nets_add(&rtnl_nets, get_net(net)); rtnl_nets_add(&rtnl_nets, tgt_net); rtnl_nets_lock(&rtnl_nets); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else err = -EINVAL; if (dev) err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); else if (!err) err = -ENODEV; rtnl_nets_unlock(&rtnl_nets); rtnl_nets_destroy(&rtnl_nets); errout: return err; } static int rtnl_group_dellink(const struct net *net, int group) { struct net_device *dev, *aux; LIST_HEAD(list_kill); bool found = false; if (!group) return -EPERM; for_each_netdev(net, dev) { if (dev->group == group) { const struct rtnl_link_ops *ops; found = true; ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; } } if (!found) return -ENODEV; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { const struct rtnl_link_ops *ops; ops = dev->rtnl_link_ops; ops->dellink(dev, &list_kill); } } unregister_netdevice_many(&list_kill); return 0; } int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); unregister_netdevice_many_notify(&list_kill, portid, nlh); return 0; } EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct net *tgt_net = net; int netnsid = -1; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } rtnl_net_lock(tgt_net); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); if (dev) err = rtnl_delete_link(dev, portid, nlh); else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else err = -EINVAL; rtnl_net_unlock(tgt_net); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { unsigned int old_flags, changed; int err; old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), NULL); if (err < 0) return err; } changed = old_flags ^ dev->flags; if (dev->rtnl_link_initializing) { dev->rtnl_link_initializing = false; changed = ~0U; } __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); if (tb[IFLA_NUM_RX_QUEUES]) num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); if (num_tx_queues < 1 || num_tx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); } if (num_rx_queues < 1 || num_rx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); } if (ops->alloc) { dev = ops->alloc(tb, ifname, name_assign_type, num_tx_queues, num_rx_queues); if (IS_ERR(dev)) return dev; } else { dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); } if (!dev) return ERR_PTR(-ENOMEM); err = validate_linkmsg(dev, tb, extack); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); err = dev_validate_mtu(dev, mtu, extack); if (err) { free_netdev(dev); return ERR_PTR(err); } dev->mtu = mtu; } if (tb[IFLA_ADDRESS]) { __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); if (tb[IFLA_GSO_IPV4_MAX_SIZE]) netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); if (tb[IFLA_GRO_IPV4_MAX_SIZE]) netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } EXPORT_SYMBOL(rtnl_create_link); struct rtnl_newlink_tbs { struct nlattr *tb[IFLA_MAX + 1]; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr *attr[RTNL_MAX_TYPE + 1]; struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; }; static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net_device *dev, struct net *tgt_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) { struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; int status = 0; int err; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) return -EOPNOTSUPP; err = ops->changelink(dev, tb, data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { const struct rtnl_link_ops *m_ops = NULL; struct nlattr **slave_data = NULL; struct net_device *master_dev; master_dev = netdev_master_upper_dev_get(dev); if (master_dev) m_ops = master_dev->rtnl_link_ops; if (!m_ops || !m_ops->slave_changelink) return -EOPNOTSUPP; if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) return -EINVAL; if (m_ops->slave_maxtype) { err = nla_parse_nested_deprecated(tbs->slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; slave_data = tbs->slave_attr; } err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); } static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, struct net *tgt_net, int group, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } } return 0; } static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, struct net *peer_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; struct rtnl_newlink_params params = { .src_net = sock_net(skb->sk), .link_net = link_net, .peer_net = peer_net, .tb = tb, .data = data, }; u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; int err; if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (tb[IFLA_IFNAME]) { nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); } else { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); name_assign_type = NET_NAME_ENUM; } dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; } dev->ifindex = ifm->ifi_index; if (ops->newlink) err = ops->newlink(dev, &params, extack); else err = register_netdevice(dev); if (err < 0) { free_netdev(dev); goto out; } netdev_lock_ops(dev); err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } netdev_unlock_ops(dev); out: return err; out_unregister: netdev_unlock_ops(dev); if (ops->newlink) { LIST_HEAD(list_kill); ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); } else { unregister_netdevice(dev); } goto out; } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX + 1]; int err; if (!data || !data[ops->peer_type]) return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) return ERR_PTR(err); if (ops->validate) { err = ops->validate(tb, NULL, extack); if (err < 0) return ERR_PTR(err); } return rtnl_link_get_net_ifla(tb); } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, struct net *peer_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) { struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); struct net *device_net; struct net_device *dev; struct ifinfomsg *ifm; bool link_specified; /* When creating, lookup for existing device in target net namespace */ device_net = (nlh->nlmsg_flags & NLM_F_CREATE) && (nlh->nlmsg_flags & NLM_F_EXCL) ? tgt_net : net; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(device_net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(device_net, tb); } else { link_specified = false; dev = NULL; } if (dev) return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, * or it's for a group */ if (link_specified || !tb[IFLA_GROUP]) return -ENODEV; return rtnl_group_changelink(skb, net, tgt_net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *tgt_net, *link_net = NULL, *peer_net = NULL; struct nlattr **tb, **linkinfo, **data = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; struct rtnl_nets rtnl_nets; int ops_srcu_index; int ret; tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); if (!tbs) return -ENOMEM; tb = tbs->tb; ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (ret < 0) goto free; ret = rtnl_ensure_unique_netns(tb, extack, false); if (ret < 0) goto free; linkinfo = tbs->linkinfo; if (tb[IFLA_LINKINFO]) { ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); if (ret < 0) goto free; } else { memset(linkinfo, 0, sizeof(tbs->linkinfo)); } if (linkinfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { request_module("rtnl-link-%s", kind); ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif } rtnl_nets_init(&rtnl_nets); if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) { ret = -EINVAL; goto put_ops; } if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (ret < 0) goto put_ops; data = tbs->attr; } if (ops->validate) { ret = ops->validate(tb, data, extack); if (ret < 0) goto put_ops; } if (ops->peer_type) { peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; } if (peer_net) rtnl_nets_add(&rtnl_nets, peer_net); } } tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { ret = PTR_ERR(tgt_net); goto put_net; } rtnl_nets_add(&rtnl_nets, tgt_net); if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); link_net = get_net_ns_by_id(tgt_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); ret = -EINVAL; goto put_net; } rtnl_nets_add(&rtnl_nets, link_net); if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto put_net; } } rtnl_nets_lock(&rtnl_nets); ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); rtnl_nets_unlock(&rtnl_nets); put_net: rtnl_nets_destroy(&rtnl_nets); put_ops: if (ops) rtnl_link_ops_put(ops, ops_srcu_index); free: kfree(tbs); return ret; } static int rtnl_valid_getlink_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; int i, err; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; for (i = 0; i <= IFLA_MAX; i++) { if (!tb[i]) continue; switch (i) { case IFLA_IFNAME: case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); return -EINVAL; } } return 0; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; int netnsid = -1; int err; u32 ext_filter_mask = 0; err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else goto out; err = -ENODEV; if (dev == NULL) goto out; err = -ENOBUFS; nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; /* Synchronize the carrier state so we don't report a state * that we're not actually going to honour immediately; if * the driver just did a carrier off->on transition, we can * only TX if link watch work has run, but without this we'd * already report carrier on, even if it doesn't work yet. */ linkwatch_sync_dev(dev); err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); out: if (netnsid >= 0) put_net(tgt_net); return err; } static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; if (cmd == RTM_NEWLINKPROP) { size = rtnl_prop_list_size(dev); size += nla_total_size(ALTIFNAMSIZ); if (size >= U16_MAX) { NL_SET_ERR_MSG(extack, "effective property list too long"); return -EINVAL; } } alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; if (cmd == RTM_NEWLINKPROP) { err = netdev_name_node_alt_create(dev, alt_ifname); if (!err) alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); } else { WARN_ON_ONCE(1); err = -EINVAL; } kfree(alt_ifname); if (!err) *changed = true; return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; struct ifinfomsg *ifm; bool changed = false; struct nlattr *attr; int err, rem; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else return -EINVAL; if (!dev) return -ENODEV; if (!tb[IFLA_PROP_LIST]) return 0; nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { switch (nla_type(attr)) { case IFLA_ALT_IFNAME: err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); if (err) return err; break; } } if (changed) netdev_state_change(dev); return 0; } static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); } static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; u32 ext_filter_mask = 0; struct net_device *dev; struct nlattr *nla; int hdrlen, rem; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return NLMSG_GOODSIZE; nla_for_each_attr_type(nla, IFLA_EXT_MASK, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), rem) { if (nla_len(nla) == sizeof(u32)) ext_filter_mask = nla_get_u32(nla); } if (!ext_filter_mask) return NLMSG_GOODSIZE; /* * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { min_ifinfo_dump_size = max(min_ifinfo_dump_size, if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; int type = cb->nlh->nlmsg_type - RTM_BASE; int ret = 0; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; if (type < 0 || type >= RTM_NR_MSGTYPES) continue; tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); if (!tab) continue; link = rcu_dereference_rtnl(tab[type]); if (!link) continue; dumpit = link->dumpit; if (!dumpit) continue; if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } ret = dumpit(skb, cb); if (ret) break; } cb->family = idx; return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; u32 seq = 0; skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; if (nlmsg_report(nlh)) seq = nlmsg_seq(nlh); else portid = 0; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } return skb; errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, new_ifindex, portid, nlh); if (skb) rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL, 0, portid, nlh); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, new_nsid, new_ifindex, 0, NULL); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (tb[NDA_FLAGS_EXT]) { netdev_info(dev, "invalid flags given to default FDB implementation\n"); return err; } if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); /* Only return duplicate errors if NLM_F_EXCL is set */ if (err == -EEXIST && !(flags & NLM_F_EXCL)) err = 0; return err; } EXPORT_SYMBOL(ndo_dflt_fdb_add); static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } *p_vid = vid; return 0; } static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; u16 vid; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, &notified, extack); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { bool notified = false; if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, &notified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } /* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_del); static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct net *net = sock_net(skb->sk); const struct net_device_ops *ops; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; __u8 *addr = NULL; int err; u16 vid; if (!del_bulk) { err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } else { /* For bulk delete, the drivers will parse the message with * policy. */ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!del_bulk) { if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); return -EINVAL; } err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, &notified, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { bool notified = false; ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, &notified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { /* in case err was cleared by NTF_MASTER call */ err = -EOPNOTSUPP; if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (!err) { if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } static int nlmsg_populate_fdb(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int *idx, struct netdev_hw_addr_list *list) { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; u32 portid, seq; int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: *idx += 1; } return 0; } /** * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @skb: socket buffer to store message in * @cb: netlink callback * @dev: netdevice * @filter_dev: ignored * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. */ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { int err; if (dev->type != ARPHRD_ETHER) return -EINVAL; netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, extack); if (err < 0) return err; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_IFINDEX: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); return -EINVAL; } *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); break; case NDA_MASTER: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); return -EINVAL; } *br_idx = nla_get_u32(tb[NDA_MASTER]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); return -EINVAL; } } return 0; } static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. * So, check for ndmsg with an optional u32 attribute (not used here). * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. */ if (nlmsg_len(nlh) != sizeof(struct ndmsg) && (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { struct ifinfomsg *ifm; err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) *br_idx = nla_get_u32(tb[IFLA_MASTER]); } ifm = nlmsg_data(nlh); *brport_idx = ifm->ifi_index; } return 0; } static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { const struct net_device_ops *ops = NULL, *cops = NULL; struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); int brport_idx = 0; int br_idx = 0; int fidx = 0; int err; NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, cb->extack); else err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, cb->extack); if (err < 0) return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) return -ENODEV; ops = br_dev->netdev_ops; } for_each_netdev_dump(net, dev, ctx->ifindex) { if (brport_idx && (dev->ifindex != brport_idx)) continue; if (!br_idx) { /* user did not specify a specific bridge */ if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && !netif_is_bridge_master(dev)) continue; cops = ops; } if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, &fidx); if (err == -EMSGSIZE) break; } } if (dev->netdev_ops->ndo_fdb_dump) err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, &fidx); else err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); if (err == -EMSGSIZE) break; cops = NULL; /* reset fdb offset to 0 for rest of the interfaces */ ctx->fdb_idx = 0; fidx = 0; } ctx->fdb_idx = fidx; return skb->len; } static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct nlattr **tb, u8 *ndm_flags, int *br_idx, int *brport_idx, u8 **addr, u16 *vid, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); return -EINVAL; } if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_MASTER: *br_idx = nla_get_u32(tb[i]); break; case NDA_LLADDR: if (nla_len(tb[i]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); return -EINVAL; } *addr = nla_data(tb[i]); break; case NDA_VLAN: err = fdb_vid_parse(tb[i], vid, extack); if (err) return err; break; case NDA_VNI: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); return -EINVAL; } } return 0; } static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net_device *dev = NULL, *br_dev = NULL; const struct net_device_ops *ops = NULL; struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct sk_buff *skb; int brport_idx = 0; u8 ndm_flags = 0; int br_idx = 0; u8 *addr = NULL; u16 vid = 0; int err; err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, &brport_idx, &addr, &vid, extack); if (err < 0) return err; if (!addr) { NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); return -EINVAL; } if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (br_idx) { if (dev) { NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); return -EINVAL; } br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) { NL_SET_ERR_MSG(extack, "Invalid master ifindex"); return -EINVAL; } ops = br_dev->netdev_ops; } if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } br_dev = netdev_master_upper_dev_get(dev); if (!br_dev) { NL_SET_ERR_MSG(extack, "Master of device not found"); return -EINVAL; } ops = br_dev->netdev_ops; } else { if (!(ndm_flags & NTF_SELF)) { NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); return -EINVAL; } ops = dev->netdev_ops; } } if (!br_dev && !dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -ENODEV; } if (!ops || !ops->ndo_fdb_get) { NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); return -EOPNOTSUPP; } skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (br_dev) dev = br_dev; err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); if (err) goto out; return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: kfree_skb(skb); return err; } static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { if (mask & flag) return nla_put_u8(skb, attrnum, !!(flags & flag)); return 0; } int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_BRIDGE; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (br_dev && nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } if (mode != BRIDGE_MODE_UNDEF) { if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } if (vlan_fill) { err = vlan_fill(skb, dev, filter_mask); if (err) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protinfo) goto nla_put_failure; if (brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING, BR_LEARNING) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROXYARP, BR_PROXYARP) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { nla_nest_cancel(skb, protinfo); goto nla_put_failure; } nla_nest_end(skb, protinfo); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return err ? err : -EMSGSIZE; } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, bool strict_check, u32 *filter_mask, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err, i; if (strict_check) { struct ifinfomsg *ifm; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } if (err < 0) return err; /* new attributes should only be added with strict checking */ for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case IFLA_EXT_MASK: *filter_mask = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); return -EINVAL; } } } return 0; } static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, cb->extack); if (err < 0 && cb->strict_check) return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = ops->ndo_bridge_getlink(skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } } err = skb->len; out_err: rcu_read_unlock(); cb->args[0] = idx; return err; } static inline size_t bridge_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ } static int rtnl_bridge_notify(struct net_device *dev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; if (!dev->netdev_ops->ndo_bridge_getlink) return 0; skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); if (!skb) { err = -ENOMEM; goto errout; } err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; /* Notification info is only filled for bridge ports, not the bridge * device itself. Therefore, a zero notification length is valid and * should not result in an error. */ if (!skb->len) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); if (err) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; br_flags_attr = attr; flags = nla_get_u16(attr); } if (nla_type(attr) == IFLA_BRIDGE_MODE) { if (nla_len(attr) < sizeof(u16)) return -EINVAL; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (br_flags_attr) memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, rem) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); break; } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); out: return err; } static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) { return (mask & IFLA_STATS_FILTER_BIT(attrid)) && (!idxattr || idxattr == attrid); } static bool rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) { return dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats && dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); } static unsigned int rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) { return rtnl_offload_xstats_have_ndo(dev, attr_id) ? sizeof(struct rtnl_link_stats64) : 0; } static int rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, struct sk_buff *skb) { unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); struct nlattr *attr = NULL; void *attr_data; int err; if (!size) return -ENODATA; attr = nla_reserve_64bit(skb, attr_id, size, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; attr_data = nla_data(attr); memset(attr_data, 0, size); err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); if (err) return err; return 0; } static unsigned int rtnl_offload_xstats_get_size_stats(const struct net_device *dev, enum netdev_offload_xstats_type type) { bool enabled = netdev_offload_xstats_enabled(dev, type); return enabled ? sizeof(struct rtnl_hw_stats64) : 0; } struct rtnl_offload_xstats_request_used { bool request; bool used; }; static int rtnl_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_offload_xstats_request_used *ru, struct rtnl_hw_stats64 *stats, struct netlink_ext_ack *extack) { bool request; bool used; int err; request = netdev_offload_xstats_enabled(dev, type); if (!request) { used = false; goto out; } err = netdev_offload_xstats_get(dev, type, stats, &used, extack); if (err) return err; out: if (ru) { ru->request = request; ru->used = used; } return 0; } static int rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, struct rtnl_offload_xstats_request_used *ru) { struct nlattr *nest; nest = nla_nest_start(skb, attr_id); if (!nest) return -EMSGSIZE; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_offload_xstats_request_used ru_l3; struct nlattr *nest; int err; err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); if (err) return err; nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); if (!nest) return -EMSGSIZE; if (rtnl_offload_xstats_fill_hw_s_info_one(skb, IFLA_OFFLOAD_XSTATS_L3_STATS, &ru_l3)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, int *prividx, u32 off_filter_mask, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; bool have_data = false; int err; if (*prividx <= attr_id_cpu_hit && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); if (!err) { have_data = true; } else if (err != -ENODATA) { *prividx = attr_id_cpu_hit; return err; } } if (*prividx <= attr_id_hw_s_info && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { *prividx = attr_id_hw_s_info; err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); if (err) return err; have_data = true; *prividx = 0; } if (*prividx <= attr_id_l3_stats && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { unsigned int size_l3; struct nlattr *attr; *prividx = attr_id_l3_stats; size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); if (!size_l3) goto skip_l3_stats; attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, nla_data(attr), extack); if (err) return err; have_data = true; skip_l3_stats: *prividx = 0; } if (!have_data) return -ENODATA; *prividx = 0; return 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, enum netdev_offload_xstats_type type) { return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ nla_total_size(sizeof(u8)) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ nla_total_size(sizeof(u8)) + 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + 0; } static int rtnl_offload_xstats_get_size(const struct net_device *dev, u32 off_filter_mask) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; int size; if (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); nla_size += nla_total_size_64bit(size); } if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { size = rtnl_offload_xstats_get_size_stats(dev, t_l3); nla_size += nla_total_size_64bit(size); } if (nla_size != 0) nla_size += nla_total_size(0); return nla_size; } struct rtnl_stats_dump_filters { /* mask[0] filters outer attributes. Then individual nests have their * filtering mask at the index of the nested attribute. */ u32 mask[IFLA_STATS_MAX + 1]; }; static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, const struct rtnl_stats_dump_filters *filters, int *idxattr, int *prividx, struct netlink_ext_ack *extack) { unsigned int filter_mask = filters->mask[0]; struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; int err; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); if (!nlh) return -EMSGSIZE; ifsm = nlmsg_data(nlh); ifsm->family = PF_UNSPEC; ifsm->pad1 = 0; ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { struct rtnl_link_stats64 *sp; attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } sp = nla_data(attr); dev_get_stats(dev, sp); } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, *idxattr)) { const struct rtnl_link_ops *ops = NULL; const struct net_device *master; master = netdev_master_upper_dev_get(dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = rtnl_offload_xstats_fill(skb, dev, prividx, off_filter_mask, extack); if (err == -ENODATA) nla_nest_cancel(skb, attr); else nla_nest_end(skb, attr); if (err && err != -ENODATA) goto nla_put_failure; *idxattr = 0; } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { struct rtnl_af_ops *af_ops; *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); err = -EMSGSIZE; goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); if (err == -ENODATA) { nla_nest_cancel(skb, af); } else if (err < 0) { rcu_read_unlock(); goto nla_put_failure; } nla_nest_end(skb, af); } } rcu_read_unlock(); nla_nest_end(skb, attr); *idxattr = 0; } nlmsg_end(skb, nlh); return 0; nla_put_failure: /* not a multi message or no progress mean a real error */ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) nlmsg_cancel(skb, nlh); else nlmsg_end(skb, nlh); return err; } static size_t if_nlmsg_stats_size(const struct net_device *dev, const struct rtnl_stats_dump_filters *filters) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); unsigned int filter_mask = filters->mask[0]; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { struct net_device *_dev = (struct net_device *)dev; const struct rtnl_link_ops *ops = NULL; const struct net_device *master; /* netdev_master_upper_dev_get can't take const */ master = netdev_master_upper_dev_get(_dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->get_linkxstats_size) { int attr = IFLA_STATS_LINK_XSTATS_SLAVE; size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS_SLAVE */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; size += rtnl_offload_xstats_get_size(dev, off_filter_mask); } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); /* for AF_* */ size += nla_total_size(0); } } rcu_read_unlock(); } return size; } #define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) static const struct nla_policy rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { [IFLA_STATS_LINK_OFFLOAD_XSTATS] = NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), }; static const struct nla_policy rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_GET_FILTERS] = NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), }; static const struct nla_policy ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_MAX + 1]; int err; int at; err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, rtnl_stats_get_policy_filters, extack); if (err < 0) return err; for (at = 1; at <= IFLA_STATS_MAX; at++) { if (tb[at]) { if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); return -EINVAL; } filters->mask[at] = nla_get_u32(tb[at]); } } return 0; } static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, u32 filter_mask, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; int err; int i; filters->mask[0] = filter_mask; for (i = 1; i < ARRAY_SIZE(filters->mask); i++) filters->mask[i] = -1U; err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_GET_FILTERS]) { err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], filters, extack); if (err) return err; } return 0; } static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { struct if_stats_msg *ifsm; ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } if (!strict_check) return 0; /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; } return 0; } static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; } err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); if (err) return err; nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, &filters, &idxattr, &prividx, extack); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); } return err; } static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct { unsigned long ifindex; int idxattr; int prividx; } *ctx = (void *)cb->ctx; struct net_device *dev; int err; cb->seq = net->dev_base_seq; err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); if (err) return err; ifsm = nlmsg_data(cb->nlh); if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, extack); if (err) return err; for_each_netdev_dump(net, dev, ctx->ifindex) { err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, &filters, &ctx->idxattr, &ctx->prividx, extack); /* If we ran out of room on the first message, * we're in trouble. */ WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); if (err < 0) break; ctx->prividx = 0; ctx->idxattr = 0; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } return err; } void rtnl_offload_xstats_notify(struct net_device *dev) { struct rtnl_stats_dump_filters response_filters = {}; struct net *net = dev_net(dev); int idxattr = 0, prividx = 0; struct sk_buff *skb; int err = -ENOBUFS; ASSERT_RTNL(); response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), GFP_KERNEL); if (!skb) goto errout; err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, &response_filters, &idxattr, &prividx, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_STATS, err); } EXPORT_SYMBOL(rtnl_offload_xstats_notify); static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_stats_dump_filters response_filters = {}; struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; struct if_stats_msg *ifsm; bool notify = false; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); return -EINVAL; } if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); return -EINVAL; } err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, ifla_stats_set_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); if (req) err = netdev_offload_xstats_enable(dev, t_l3, extack); else err = netdev_offload_xstats_disable(dev, t_l3); if (!err) notify = true; else if (err != -EALREADY) return err; response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); } if (notify) rtnl_offload_xstats_notify(dev); return 0; } static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct br_port_msg *bpm; bpm = nlmsg_payload(nlh, sizeof(*bpm)); if (!bpm) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*bpm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); return -EINVAL; } return 0; } struct rtnl_mdb_dump_ctx { long idx; }; static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx, s_idx; int err; NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); if (err) return err; } s_idx = ctx->idx; idx = 0; for_each_netdev(net, dev) { if (idx < s_idx) goto skip; if (!dev->netdev_ops->ndo_mdb_dump) goto skip; err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb); if (err == -EMSGSIZE) goto out; /* Moving on to next device, reset markers and sequence * counters since they are all maintained per-device. */ memset(cb->ctx, 0, sizeof(cb->ctx)); cb->prev_seq = 0; cb->seq = 0; skip: idx++; } out: ctx->idx = idx; return skb->len; } static int rtnl_validate_mdb_entry_get(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex) { NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified"); return -EINVAL; } if (entry->state) { NL_SET_ERR_MSG(extack, "Entry state cannot be specified"); return -EINVAL; } if (entry->flags) { NL_SET_ERR_MSG(extack, "Entry flags cannot be specified"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } if (entry->addr.proto != htons(ETH_P_IP) && entry->addr.proto != htons(ETH_P_IPV6) && entry->addr.proto != 0) { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } return 0; } static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = { [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry_get, sizeof(struct br_mdb_entry)), [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1]; struct net *net = sock_net(in_skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb, MDBA_GET_ENTRY_MAX, mdba_get_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_get) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); } static int rtnl_validate_mdb_entry(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex == 0) { NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed"); return -EINVAL; } if (entry->addr.proto == htons(ETH_P_IP)) { if (!ipv4_is_multicast(entry->addr.u.ip4) && !ipv4_is_zeronet(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0"); return -EINVAL; } if (ipv4_is_local_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) } else if (entry->addr.proto == htons(ETH_P_IPV6)) { if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes"); return -EINVAL; } #endif } else if (entry->addr.proto == 0) { /* L2 mdb */ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { NL_SET_ERR_MSG(extack, "L2 entry group is not multicast"); return -EINVAL; } } else { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG(extack, "Unknown entry state"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } return 0; } static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = { [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 }, [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry, sizeof(struct br_mdb_entry)), [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,