1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 // SPDX-License-Identifier: GPL-2.0 /* * Parts of this file are * Copyright (C) 2022-2023 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "rdev-ops.h" static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (!rdev->ops->stop_ap) return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; if (!wdev->links[link_id].ap.beacon_interval) return -ENOENT; err = rdev_stop_ap(rdev, dev, link_id); if (!err) { wdev->conn_owner_nlportid = 0; wdev->links[link_id].ap.beacon_interval = 0; memset(&wdev->links[link_id].ap.chandef, 0, sizeof(wdev->links[link_id].ap.chandef)); wdev->u.ap.ssid_len = 0; rdev_set_qos_map(rdev, dev, NULL); if (notify) nl80211_send_ap_stopped(wdev, link_id); /* Should we apply the grace period during beaconing interface * shutdown also? */ cfg80211_sched_dfs_chan_update(rdev); } schedule_work(&cfg80211_disconnect_work); return err; } int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, int link_id, bool notify) { unsigned int link; int ret = 0; if (link_id >= 0) return ___cfg80211_stop_ap(rdev, dev, link_id, notify); for_each_valid_link(dev->ieee80211_ptr, link) { int ret1 = ___cfg80211_stop_ap(rdev, dev, link, notify); if (ret1) ret = ret1; /* try the next one also if one errored */ } return ret; }
75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/signalfd.h * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * */ #ifndef _LINUX_SIGNALFD_H #define _LINUX_SIGNALFD_H #include <uapi/linux/signalfd.h> #include <linux/sched/signal.h> #ifdef CONFIG_SIGNALFD /* * Deliver the signal to listening signalfd. */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh))) wake_up(&tsk->sighand->signalfd_wqh); } extern void signalfd_cleanup(struct sighand_struct *sighand); #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } static inline void signalfd_cleanup(struct sighand_struct *sighand) { } #endif /* CONFIG_SIGNALFD */ #endif /* _LINUX_SIGNALFD_H */
123 123 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GRE_H #define __LINUX_GRE_H #include <linux/skbuff.h> #include <net/ip_tunnels.h> struct gre_base_hdr { __be16 flags; __be16 protocol; } __packed; struct gre_full_hdr { struct gre_base_hdr fixed_header; __be16 csum; __be16 reserved1; __be32 key; __be32 seq; } __packed; #define GRE_HEADER_SECTION 4 #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 #define GRE_IP_PROTO_MAX 2 struct gre_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); }; int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type); int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs); static inline bool netif_is_gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "gretap"); } static inline bool netif_is_ip6gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } static inline int gre_calc_hlen(__be16 o_flags) { int addend = 4; if (o_flags & TUNNEL_CSUM) addend += 4; if (o_flags & TUNNEL_KEY) addend += 4; if (o_flags & TUNNEL_SEQ) addend += 4; return addend; } static inline __be16 gre_flags_to_tnl_flags(__be16 flags) { __be16 tflags = 0; if (flags & GRE_CSUM) tflags |= TUNNEL_CSUM; if (flags & GRE_ROUTING) tflags |= TUNNEL_ROUTING; if (flags & GRE_KEY) tflags |= TUNNEL_KEY; if (flags & GRE_SEQ) tflags |= TUNNEL_SEQ; if (flags & GRE_STRICT) tflags |= TUNNEL_STRICT; if (flags & GRE_REC) tflags |= TUNNEL_REC; if (flags & GRE_VERSION) tflags |= TUNNEL_VERSION; return tflags; } static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags) { __be16 flags = 0; if (tflags & TUNNEL_CSUM) flags |= GRE_CSUM; if (tflags & TUNNEL_ROUTING) flags |= GRE_ROUTING; if (tflags & TUNNEL_KEY) flags |= GRE_KEY; if (tflags & TUNNEL_SEQ) flags |= GRE_SEQ; if (tflags & TUNNEL_STRICT) flags |= GRE_STRICT; if (tflags & TUNNEL_REC) flags |= GRE_REC; if (tflags & TUNNEL_VERSION) flags |= GRE_VERSION; return flags; } static inline void gre_build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { struct gre_base_hdr *greh; skb_push(skb, hdr_len); skb_set_inner_protocol(skb, proto); skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); if (flags & TUNNEL_SEQ) { *ptr = seq; ptr--; } if (flags & TUNNEL_KEY) { *ptr = key; ptr--; } if (flags & TUNNEL_CSUM && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; if (skb->ip_summed == CHECKSUM_PARTIAL) { *(__sum16 *)ptr = csum_fold(lco_csum(skb)); } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = sizeof(*greh); } } } } #endif
4 4 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge per vlan tunnel port dst_metadata handling code * * Authors: * Roopa Prabhu <roopa@cumulusnetworks.com> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/switchdev.h> #include <net/dst_metadata.h> #include "br_private.h" #include "br_private_tunnel.h" static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct net_bridge_vlan *vle = ptr; __be64 tunid = *(__be64 *)arg->key; return vle->tinfo.tunnel_id != tunid; } static const struct rhashtable_params br_vlan_tunnel_rht_params = { .head_offset = offsetof(struct net_bridge_vlan, tnode), .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id), .key_len = sizeof(__be64), .nelem_hint = 3, .obj_cmpfn = br_vlan_tunid_cmp, .automatic_shrinking = true, }; static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, __be64 tunnel_id) { return rhashtable_lookup_fast(tbl, &tunnel_id, br_vlan_tunnel_rht_params); } static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan) { struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst); WRITE_ONCE(vlan->tinfo.tunnel_id, 0); RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL); dst_release(&tdst->dst); } void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan) { if (!rcu_access_pointer(vlan->tinfo.tunnel_dst)) return; rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); vlan_tunnel_info_release(vlan); } static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u32 tun_id) { struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); int err; if (metadata) return -EEXIST; metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, key, 0); if (!metadata) return -EINVAL; metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata); WRITE_ONCE(vlan->tinfo.tunnel_id, key); err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); if (err) goto out; return 0; out: vlan_tunnel_info_release(vlan); return err; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, u32 tun_id) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; ASSERT_RTNL(); vg = nbp_vlan_group(port); vlan = br_vlan_find(vg, vid); if (!vlan) return -EINVAL; return __vlan_tunnel_info_add(vg, vlan, tun_id); } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = nbp_vlan_group(port); v = br_vlan_find(vg, vid); if (!v) return -ENOENT; vlan_tunnel_info_del(vg, v); return 0; } static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) vlan_tunnel_info_del(vg, vlan); } void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_tunnel_info_flush(vg); } int vlan_tunnel_init(struct net_bridge_vlan_group *vg) { return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params); } void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg) { rhashtable_destroy(&vg->tunnel_hash); } void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct ip_tunnel_info *tinfo = skb_tunnel_info(skb); struct net_bridge_vlan *vlan; if (!vg || !tinfo) return; /* if already tagged, ignore */ if (skb_vlan_tagged(skb)) return; /* lookup vid, given tunnel id */ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id); if (!vlan) return; skb_dst_drop(skb); __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid); } int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { struct metadata_dst *tunnel_dst; __be64 tunnel_id; int err; if (!vlan) return 0; tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id); if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb))) return 0; skb_dst_drop(skb); err = skb_vlan_pop(skb); if (err) return err; if (BR_INPUT_SKB_CB(skb)->backup_nhid) { tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, tunnel_id, 0); if (!tunnel_dst) return -ENOMEM; tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; tunnel_dst->u.tun_info.key.nhid = BR_INPUT_SKB_CB(skb)->backup_nhid; skb_dst_set(skb, &tunnel_dst->dst); return 0; } tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); return 0; }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 // SPDX-License-Identifier: GPL-2.0-or-later /* * zswap.c - zswap driver file * * zswap is a cache that takes pages that are in the process * of being swapped out and attempts to compress and store them in a * RAM-based memory pool. This can result in a significant I/O reduction on * the swap device and, in the case where decompressing from RAM is faster * than reading from the swap device, can also improve workload performance. * * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/mempolicy.h> #include <linux/mempool.h> #include <linux/zpool.h> #include <crypto/acompress.h> #include <linux/zswap.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/swapops.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/workqueue.h> #include <linux/list_lru.h> #include "swap.h" #include "internal.h" /********************************* * statistics **********************************/ /* Total bytes used by the compressed storage */ u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for * performance reasons so they may not be a 100% accurate. However, * they do provide useful information on roughly how many times a * certain event is occurring. */ /* Pool limit was hit (see zswap_max_pool_percent) */ static u64 zswap_pool_limit_hit; /* Pages written back when pool limit was reached */ static u64 zswap_written_back_pages; /* Store failed due to a reclaim failure after pool limit was reached */ static u64 zswap_reject_reclaim_fail; /* Store failed due to compression algorithm failure */ static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; /* Store failed because underlying allocator could not get memory */ static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; /* Duplicate store was encountered (rare) */ static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; /* Pool limit was hit, we need to calm down */ static bool zswap_pool_reached_full; /********************************* * tunables **********************************/ #define ZSWAP_PARAM_UNSET "" static int zswap_setup(void); /* Enable/disable zswap */ static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_enabled_param_ops = { .set = zswap_enabled_param_set, .get = param_get_bool, }; module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, .get = param_get_charp, .free = param_free_charp, }; module_param_cb(compressor, &zswap_compressor_param_ops, &zswap_compressor, 0644); /* Compressed storage zpool to use */ static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_zpool_param_ops = { .set = zswap_zpool_param_set, .get = param_get_charp, .free = param_free_charp, }; module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); /* The threshold for accepting new pages after the max_pool_percent was hit */ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); /* * Enable/disable handling same-value filled pages (enabled by default). * If disabled every page is considered non-same-value filled. */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, bool, 0644); /* Enable/disable handling non-same-value filled pages (enabled by default) */ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); static bool zswap_exclusive_loads_enabled = IS_ENABLED( CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 /* Enable/disable memory pressure-based shrinker. */ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); bool is_zswap_enabled(void) { return zswap_enabled; } /********************************* * data structures **********************************/ struct crypto_acomp_ctx { struct crypto_acomp *acomp; struct acomp_req *req; struct crypto_wait wait; u8 *buffer; struct mutex mutex; }; /* * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock. * The only case where lru_lock is not acquired while holding tree.lock is * when a zswap_entry is taken off the lru for writeback, in that case it * needs to be verified that it's still valid in the tree. */ struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; struct work_struct release_work; struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; struct shrinker *shrinker; atomic_t nr_stored; }; /* * struct zswap_entry * * This structure contains the metadata for tracking a single compressed * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; int refcount; unsigned int length; struct zswap_pool *pool; union { unsigned long handle; unsigned long value; }; struct obj_cgroup *objcg; struct list_head lru; }; /* * The tree lock in the zswap_tree struct protects a few things: * - the rbtree * - the refcount field of each entry in the tree */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); /* pool counter to provide unique names to zpool */ static atomic_t zswap_pools_count = ATOMIC_INIT(0); enum zswap_init_type { ZSWAP_UNINIT, ZSWAP_INIT_SUCCEED, ZSWAP_INIT_FAILED }; static enum zswap_init_type zswap_init_state; /* used to ensure the integrity of initialization */ static DEFINE_MUTEX(zswap_init_lock); /* init completed, but couldn't create the initial pool */ static bool zswap_has_pool; /********************************* * helpers and fwd declarations **********************************/ #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static bool zswap_can_accept(void) { return totalram_pages() * zswap_accept_thr_percent / 100 * zswap_max_pool_percent / 100 > DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static u64 get_zswap_pool_size(struct zswap_pool *pool) { u64 pool_size = 0; int i; for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) pool_size += zpool_get_total_size(pool->zpools[i]); return pool_size; } static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) total += get_zswap_pool_size(pool); rcu_read_unlock(); zswap_pool_total_size = total; } /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) { return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; } #else static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) { return NULL; } #endif static inline int entry_to_nid(struct zswap_entry *entry) { return page_to_nid(virt_to_page(entry)); } void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { struct zswap_pool *pool; /* lock out zswap pools list modification */ spin_lock(&zswap_pools_lock); list_for_each_entry(pool, &zswap_pools, list) { if (pool->next_shrink == memcg) pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); } spin_unlock(&zswap_pools_lock); } /********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { struct zswap_entry *entry; entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; entry->refcount = 1; RB_CLEAR_NODE(&entry->rbnode); return entry; } static void zswap_entry_cache_free(struct zswap_entry *entry) { kmem_cache_free(zswap_entry_cache, entry); } /********************************* * zswap lruvec functions **********************************/ void zswap_lruvec_state_init(struct lruvec *lruvec) { atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); } void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; VM_WARN_ON_ONCE(!folio_test_locked(folio)); lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } /********************************* * lru functions **********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { atomic_long_t *nr_zswap_protected; unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection * used in list_lru lookup, only two scenarios are possible: * * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The * new entry will be reparented to memcg's parent's list_lru. * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del() and list_lru_putback(). */ rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); /* Update the protection area */ lru_size = list_lru_count_one(list_lru, nid, memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; old = atomic_long_inc_return(nr_zswap_protected); /* * Decay to avoid overflow and adapt to changing workloads. * This is based on LRU reclaim cost decaying heuristics. */ do { new = old > lru_size / 4 ? old / 2 : old; } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) { int nid = entry_to_nid(entry); struct mem_cgroup *memcg; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_del(list_lru, &entry->lru, nid, memcg); rcu_read_unlock(); } static void zswap_lru_putback(struct list_lru *list_lru, struct zswap_entry *entry) { int nid = entry_to_nid(entry); spinlock_t *lock = &list_lru->node[nid].lock; struct mem_cgroup *memcg; struct lruvec *lruvec; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); spin_lock(lock); /* we cannot use list_lru_add here, because it increments node's lru count */ list_lru_putback(list_lru, &entry->lru, nid, memcg); spin_unlock(lock); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); /* increment the protection area to account for the LRU rotation. */ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); rcu_read_unlock(); } /********************************* * rbtree functions **********************************/ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) { struct rb_node *node = root->rb_node; struct zswap_entry *entry; pgoff_t entry_offset; while (node) { entry = rb_entry(node, struct zswap_entry, rbnode); entry_offset = swp_offset(entry->swpentry); if (entry_offset > offset) node = node->rb_left; else if (entry_offset < offset) node = node->rb_right; else return entry; } return NULL; } /* * In the case that a entry with the same offset is found, a pointer to * the existing entry is stored in dupentry and the function returns -EEXIST */ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, struct zswap_entry **dupentry) { struct rb_node **link = &root->rb_node, *parent = NULL; struct zswap_entry *myentry; pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); while (*link) { parent = *link; myentry = rb_entry(parent, struct zswap_entry, rbnode); myentry_offset = swp_offset(myentry->swpentry); if (myentry_offset > entry_offset) link = &(*link)->rb_left; else if (myentry_offset < entry_offset) link = &(*link)->rb_right; else { *dupentry = myentry; return -EEXIST; } } rb_link_node(&entry->rbnode, parent, link); rb_insert_color(&entry->rbnode, root); return 0; } static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) { if (!RB_EMPTY_NODE(&entry->rbnode)) { rb_erase(&entry->rbnode, root); RB_CLEAR_NODE(&entry->rbnode); return true; } return false; } static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { int i = 0; if (ZSWAP_NR_ZPOOLS > 1) i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); return entry->pool->zpools[i]; } /* * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_entry *entry) { if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); } /* caller must hold the tree lock */ static void zswap_entry_get(struct zswap_entry *entry) { entry->refcount++; } /* caller must hold the tree lock * remove from the tree and free it, if nobody reference the entry */ static void zswap_entry_put(struct zswap_tree *tree, struct zswap_entry *entry) { int refcount = --entry->refcount; WARN_ON_ONCE(refcount < 0); if (refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_free_entry(entry); } } /* caller must hold the tree lock */ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) zswap_entry_get(entry); return entry; } /********************************* * shrinker functions **********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg); static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); unsigned long shrink_ret, nr_protected, lru_size; struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { sc->nr_scanned = 0; return SHRINK_STOP; } nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); lru_size = list_lru_shrink_count(&pool->list_lru, sc); /* * Abort if we are shrinking into the protected region. * * This short-circuiting is necessary because if we have too many multiple * concurrent reclaimers getting the freeable zswap object counts at the * same time (before any of them made reasonable progress), the total * number of reclaimed objects might be more than the number of unprotected * objects (i.e the reclaimers will reclaim into the protected area of the * zswap LRU). */ if (nr_protected >= lru_size - sc->nr_to_scan) { sc->nr_scanned = 0; return SHRINK_STOP; } shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) return SHRINK_STOP; return shrink_ret ? shrink_ret : SHRINK_STOP; } static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { struct zswap_pool *pool = shrinker->private_data; struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; #ifdef CONFIG_MEMCG_KMEM mem_cgroup_flush_stats(memcg); nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else /* use pool stats instead of memcg stats */ nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; nr_stored = atomic_read(&pool->nr_stored); #endif if (!nr_stored) return 0; nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. */ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; /* * Scale the number of freeable pages by the memory saving factor. * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). */ return mult_frac(nr_freeable, nr_backing, nr_stored); } static void zswap_alloc_shrinker(struct zswap_pool *pool) { pool->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); if (!pool->shrinker) return; pool->shrinker->private_data = pool; pool->shrinker->scan_objects = zswap_shrinker_scan; pool->shrinker->count_objects = zswap_shrinker_count; pool->shrinker->batch = 0; pool->shrinker->seeks = DEFAULT_SEEKS; } /********************************* * per-cpu code **********************************/ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); struct crypto_acomp *acomp; struct acomp_req *req; int ret; mutex_init(&acomp_ctx->mutex); acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!acomp_ctx->buffer) return -ENOMEM; acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); goto acomp_fail; } acomp_ctx->acomp = acomp; req = acomp_request_alloc(acomp_ctx->acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; goto req_fail; } acomp_ctx->req = req; crypto_init_wait(&acomp_ctx->wait); /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback * won't be called, crypto_wait_req() will return without blocking. */ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); return 0; req_fail: crypto_free_acomp(acomp_ctx->acomp); acomp_fail: kfree(acomp_ctx->buffer); return ret; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); if (!IS_ERR_OR_NULL(acomp_ctx)) { if (!IS_ERR_OR_NULL(acomp_ctx->req)) acomp_request_free(acomp_ctx->req); if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); kfree(acomp_ctx->buffer); } return 0; } /********************************* * pool functions **********************************/ static struct zswap_pool *__zswap_pool_current(void) { struct zswap_pool *pool; pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); WARN_ONCE(!pool && zswap_has_pool, "%s: no page storage pool!\n", __func__); return pool; } static struct zswap_pool *zswap_pool_current(void) { assert_spin_locked(&zswap_pools_lock); return __zswap_pool_current(); } static struct zswap_pool *zswap_pool_current_get(void) { struct zswap_pool *pool; rcu_read_lock(); pool = __zswap_pool_current(); if (!zswap_pool_get(pool)) pool = NULL; rcu_read_unlock(); return pool; } static struct zswap_pool *zswap_pool_last_get(void) { struct zswap_pool *pool, *last = NULL; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) last = pool; WARN_ONCE(!last && zswap_has_pool, "%s: no page storage pool!\n", __func__); if (!zswap_pool_get(last)) last = NULL; rcu_read_unlock(); return last; } /* type and compressor must be null-terminated */ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { struct zswap_pool *pool; assert_spin_locked(&zswap_pools_lock); list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; /* all zpools share the same type */ if (strcmp(zpool_get_type(pool->zpools[0]), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_get(pool)) continue; return pool; } return NULL; } /* * If the entry is still valid in the tree, drop the initial ref and remove it * from the tree. This function must be called with an additional ref held, * otherwise it may race with another invalidation freeing the entry. */ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { if (zswap_rb_erase(&tree->rbroot, entry)) zswap_entry_put(tree, entry); } static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; struct zswap_tree *tree; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; /* * Once the lru lock is dropped, the entry might get freed. The * swpoffset is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree. */ swpoffset = swp_offset(entry->swpentry); tree = zswap_trees[swp_type(entry->swpentry)]; list_lru_isolate(l, item); /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); /* Check for invalidate() race */ spin_lock(&tree->lock); if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) goto unlock; /* Hold a reference to prevent a free during writeback */ zswap_entry_get(entry); spin_unlock(&tree->lock); writeback_result = zswap_writeback_entry(entry, tree); spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* * Encountering a page already in swap cache is a sign that we are shrinking * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; goto put_unlock; } zswap_written_back_pages++; if (entry->objcg) count_objcg_event(entry->objcg, ZSWPWB); count_vm_event(ZSWPWB); /* * Writeback started successfully, the page now belongs to the * swapcache. Drop the entry from zswap - unless invalidate already * took it out while we had the tree->lock released for IO. */ zswap_invalidate_entry(tree, entry); put_unlock: /* Drop local reference */ zswap_entry_put(tree, entry); unlock: spin_unlock(&tree->lock); spin_lock(lock); return ret; } static int shrink_memcg(struct mem_cgroup *memcg) { struct zswap_pool *pool; int nid, shrunk = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) return -EINVAL; /* * Skip zombies because their LRUs are reparented and we would be * reclaiming from the parent instead of the dead memcg. */ if (memcg && !mem_cgroup_online(memcg)) return -ENOENT; pool = zswap_pool_current_get(); if (!pool) return -EINVAL; for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } zswap_pool_put(pool); return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); struct mem_cgroup *memcg; int ret, failures = 0; /* global reclaim will select cgroup in a round-robin fashion. */ do { spin_lock(&zswap_pools_lock); pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); memcg = pool->next_shrink; /* * We need to retry if we have gone through a full round trip, or if we * got an offline memcg (or else we risk undoing the effect of the * zswap memcg offlining cleanup callback). This is not catastrophic * per se, but it will keep the now offlined memcg hostage for a while. * * Note that if we got an online memcg, we will keep the extra * reference in case the original reference obtained by mem_cgroup_iter * is dropped by the zswap memcg offlining callback, ensuring that the * memcg is not killed when we are reclaiming. */ if (!memcg) { spin_unlock(&zswap_pools_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); pool->next_shrink = NULL; spin_unlock(&zswap_pools_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } spin_unlock(&zswap_pools_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); if (ret == -EINVAL) break; if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: cond_resched(); } while (!zswap_can_accept()); zswap_pool_put(pool); } static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { int i; struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; int ret; if (!zswap_has_pool) { /* if either are unset, pool initialization failed, and we * need both params to be set correctly before trying to * create a pool. */ if (!strcmp(type, ZSWAP_PARAM_UNSET)) return NULL; if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) return NULL; } pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); pool->zpools[i] = zpool_create_pool(type, name, gfp); if (!pool->zpools[i]) { pr_err("%s zpool not available\n", type); goto error; } } pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); if (ret) goto error; zswap_alloc_shrinker(pool); if (!pool->shrinker) goto error; pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) goto lru_fail; shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; lru_fail: list_lru_destroy(&pool->list_lru); shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); while (i--) zpool_destroy_pool(pool->zpools[i]); kfree(pool); return NULL; } static struct zswap_pool *__zswap_pool_create_fallback(void) { bool has_comp, has_zpool; has_comp = crypto_has_acomp(zswap_compressor, 0, 0); if (!has_comp && strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; has_comp = crypto_has_acomp(zswap_compressor, 0, 0); } if (!has_comp) { pr_err("default compressor %s not available\n", zswap_compressor); param_free_charp(&zswap_compressor); zswap_compressor = ZSWAP_PARAM_UNSET; } has_zpool = zpool_has_pool(zswap_zpool_type); if (!has_zpool && strcmp(zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT)) { pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); param_free_charp(&zswap_zpool_type); zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; has_zpool = zpool_has_pool(zswap_zpool_type); } if (!has_zpool) { pr_err("default zpool %s not available\n", zswap_zpool_type); param_free_charp(&zswap_zpool_type); zswap_zpool_type = ZSWAP_PARAM_UNSET; } if (!has_comp || !has_zpool) return NULL; return zswap_pool_create(zswap_zpool_type, zswap_compressor); } static void zswap_pool_destroy(struct zswap_pool *pool) { int i; zswap_pool_debug("destroying", pool); shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); spin_lock(&zswap_pools_lock); mem_cgroup_iter_break(NULL, pool->next_shrink); pool->next_shrink = NULL; spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); } static int __must_check zswap_pool_get(struct zswap_pool *pool) { if (!pool) return 0; return kref_get_unless_zero(&pool->kref); } static void __zswap_pool_release(struct work_struct *work) { struct zswap_pool *pool = container_of(work, typeof(*pool), release_work); synchronize_rcu(); /* nobody should have been able to get a kref... */ WARN_ON(kref_get_unless_zero(&pool->kref)); /* pool is now off zswap_pools list and has no references. */ zswap_pool_destroy(pool); } static void __zswap_pool_empty(struct kref *kref) { struct zswap_pool *pool; pool = container_of(kref, typeof(*pool), kref); spin_lock(&zswap_pools_lock); WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); INIT_WORK(&pool->release_work, __zswap_pool_release); schedule_work(&pool->release_work); spin_unlock(&zswap_pools_lock); } static void zswap_pool_put(struct zswap_pool *pool) { kref_put(&pool->kref, __zswap_pool_empty); } /********************************* * param callbacks **********************************/ static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) { /* no change required */ if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) return false; return true; } /* val must be a null-terminated string */ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *type, char *compressor) { struct zswap_pool *pool, *put_pool = NULL; char *s = strstrip((char *)val); int ret = 0; bool new_pool = false; mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init. */ ret = param_set_charp(s, kp); break; case ZSWAP_INIT_SUCCEED: new_pool = zswap_pool_changed(s, kp); break; case ZSWAP_INIT_FAILED: pr_err("can't set param, initialization failed\n"); ret = -ENODEV; } mutex_unlock(&zswap_init_lock); /* no need to create a new pool, return directly */ if (!new_pool) return ret; if (!type) { if (!zpool_has_pool(s)) { pr_err("zpool %s not available\n", s); return -ENOENT; } type = s; } else if (!compressor) { if (!crypto_has_acomp(s, 0, 0)) { pr_err("compressor %s not available\n", s); return -ENOENT; } compressor = s; } else { WARN_ON(1); return -EINVAL; } spin_lock(&zswap_pools_lock); pool = zswap_pool_find_get(type, compressor); if (pool) { zswap_pool_debug("using existing", pool); WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); } spin_unlock(&zswap_pools_lock); if (!pool) pool = zswap_pool_create(type, compressor); if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; spin_lock(&zswap_pools_lock); if (!ret) { put_pool = zswap_pool_current(); list_add_rcu(&pool->list, &zswap_pools); zswap_has_pool = true; } else if (pool) { /* add the possibly pre-existing pool to the end of the pools * list; if it's new (and empty) then it'll be removed and * destroyed by the put after we drop the lock */ list_add_tail_rcu(&pool->list, &zswap_pools); put_pool = pool; } spin_unlock(&zswap_pools_lock); if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also * failed, maybe both compressor and zpool params were bad. * Allow changing this param, so pool creation will succeed * when the other param is changed. We already verified this * param is ok in the zpool_has_pool() or crypto_has_acomp() * checks above. */ ret = param_set_charp(s, kp); } /* drop the ref from either the old current pool, * or the new pool we failed to add */ if (put_pool) zswap_pool_put(put_pool); return ret; } static int zswap_compressor_param_set(const char *val, const struct kernel_param *kp) { return __zswap_param_set(val, kp, zswap_zpool_type, NULL); } static int zswap_zpool_param_set(const char *val, const struct kernel_param *kp) { return __zswap_param_set(val, kp, NULL, zswap_compressor); } static int zswap_enabled_param_set(const char *val, const struct kernel_param *kp) { int ret = -ENODEV; /* if this is load-time (pre-init) param setting, only set param. */ if (system_state != SYSTEM_RUNNING) return param_set_bool(val, kp); mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: if (zswap_setup()) break; fallthrough; case ZSWAP_INIT_SUCCEED: if (!zswap_has_pool) pr_err("can't enable, no pool configured\n"); else ret = param_set_bool(val, kp); break; case ZSWAP_INIT_FAILED: pr_err("can't enable, initialization failed\n"); } mutex_unlock(&zswap_init_lock); return ret; } static void __zswap_load(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; u8 *src; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(zpool)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); } sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); } /********************************* * writeback code **********************************/ /* * Attempts to free an entry by adding a folio to the swap cache, * decompressing the entry data into the folio, and issuing a * bio write to write the folio back to the swap device. * * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree) { swp_entry_t swpentry = entry->swpentry; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; /* try to allocate swap cache folio */ mpol = get_task_policy(current); folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, NO_INTERLEAVE_INDEX, &folio_was_allocated, true); if (!folio) return -ENOMEM; /* * Found an existing folio, we raced with load/swapin. We generally * writeback cold folios from zswap, and swapin means the folio just * became hot. Skip this folio and let the caller find another one. */ if (!folio_was_allocated) { folio_put(folio); return -EEXIST; } /* * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot. Verify that the * swap entry hasn't been invalidated and recycled behind our * backs (our zswap_entry reference doesn't prevent that), to * avoid overwriting a new swap folio with old compressed data. */ spin_lock(&tree->lock); if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } spin_unlock(&tree->lock); __zswap_load(entry, &folio->page); /* folio is up to date */ folio_mark_uptodate(folio); /* move it to the tail of the inactive list after end_writeback */ folio_set_reclaim(folio); /* start writeback */ __swap_writepage(folio, &wbc); folio_put(folio); return 0; } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; unsigned long val; unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; page = (unsigned long *)ptr; val = page[0]; if (val != page[last_pos]) return 0; for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) return 0; } *value = val; return 1; } static void zswap_fill_page(void *ptr, unsigned long value) { unsigned long *page; page = (unsigned long *)ptr; memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; struct zpool *zpool; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; gfp_t gfp; int ret; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); /* Large folios aren't supported */ if (folio_test_large(folio)) return false; if (!tree) return false; /* * If this is a duplicate, it must be removed before attempting to store * it, otherwise, if the store fails the old page won't be removed from * the tree, and it might be written back overriding the new data. */ spin_lock(&tree->lock); dupentry = zswap_rb_search(&tree->rbroot, offset); if (dupentry) { zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); if (!zswap_enabled) return false; objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); if (shrink_memcg(memcg)) { mem_cgroup_put(memcg); goto reject; } mem_cgroup_put(memcg); } /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; zswap_pool_reached_full = true; goto shrink; } if (zswap_pool_reached_full) { if (!zswap_can_accept()) goto shrink; else zswap_pool_reached_full = false; } /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); goto insert_entry; } kunmap_local(src); } if (!zswap_non_same_filled_pages_enabled) goto freepage; /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) goto freepage; if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } mem_cgroup_put(memcg); } /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, &folio->page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, * and hardware-accelerators may won't check the dst buffer size, so * giving the dst buffer with enough length to avoid buffer overflow. */ sg_init_one(&output, dst, PAGE_SIZE * 2); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* * it maybe looks a little bit silly that we send an asynchronous request, * then wait for its completion synchronously. This makes the process look * synchronous in fact. * Theoretically, acomp supports users send multiple acomp requests in one * acomp instance, then get those requests done simultaneously. but in this * case, zswap actually does store and load page by page, there is no * existing method to send the second page before the first page is done * in one thread doing zwap. * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. */ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; if (ret) { zswap_reject_compress_fail++; goto put_dstmem; } /* store */ zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; ret = zpool_malloc(zpool, dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; } if (ret) { zswap_reject_alloc_fail++; goto put_dstmem; } buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); zpool_unmap_handle(zpool, handle); mutex_unlock(&acomp_ctx->mutex); /* populate entry */ entry->swpentry = swp_entry(type, offset); entry->handle = handle; entry->length = dlen; insert_entry: entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); /* Account before objcg ref is moved to tree */ count_objcg_event(objcg, ZSWPOUT); } /* map */ spin_lock(&tree->lock); /* * A duplicate entry should have been removed at the beginning of this * function. Since the swap entry should be pinned, if a duplicate is * found again here it means that something went wrong in the swap * cache. */ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { WARN_ON(1); zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); zswap_update_total_size(); count_vm_event(ZSWPOUT); return true; put_dstmem: mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); reject: if (objcg) obj_cgroup_put(objcg); return false; shrink: pool = zswap_pool_last_get(); if (pool && !queue_work(shrink_wq, &pool->shrink_work)) zswap_pool_put(pool); goto reject; } bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); /* find */ spin_lock(&tree->lock); entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } spin_unlock(&tree->lock); if (entry->length) __zswap_load(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); kunmap_local(dst); } count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); spin_lock(&tree->lock); if (zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { zswap_lru_del(&entry->pool->list_lru, entry); zswap_lru_add(&entry->pool->list_lru, entry); } zswap_entry_put(tree, entry); spin_unlock(&tree->lock); return true; } void zswap_invalidate(int type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { /* entry was written back */ spin_unlock(&tree->lock); return; } zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } void zswap_swapon(int type) { struct zswap_tree *tree; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; } tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; } void zswap_swapoff(int type) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) zswap_free_entry(entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); kfree(tree); zswap_trees[type] = NULL; } /********************************* * debugfs functions **********************************/ #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *zswap_debugfs_root; static int zswap_debugfs_init(void) { if (!debugfs_initialized()) return -ENODEV; zswap_debugfs_root = debugfs_create_dir("zswap", NULL); debugfs_create_u64("pool_limit_hit", 0444, zswap_debugfs_root, &zswap_pool_limit_hit); debugfs_create_u64("reject_reclaim_fail", 0444, zswap_debugfs_root, &zswap_reject_reclaim_fail); debugfs_create_u64("reject_alloc_fail", 0444, zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", 0444, zswap_debugfs_root, &zswap_reject_kmemcache_fail); debugfs_create_u64("reject_compress_fail", 0444, zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", 0444, zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, zswap_debugfs_root, &zswap_same_filled_pages); return 0; } #else static int zswap_debugfs_init(void) { return 0; } #endif /********************************* * module init and exit **********************************/ static int zswap_setup(void) { struct zswap_pool *pool; int ret; zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); if (!zswap_entry_cache) { pr_err("entry cache creation failed\n"); goto cache_fail; } ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, zswap_cpu_comp_dead); if (ret) goto hp_fail; pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, zpool_get_type(pool->zpools[0])); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; } else { pr_err("pool creation failed\n"); zswap_enabled = false; } shrink_wq = create_workqueue("zswap-shrink"); if (!shrink_wq) goto fallback_fail; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; fallback_fail: if (pool) zswap_pool_destroy(pool); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: /* if built-in, we aren't unloaded on failure; don't allow use */ zswap_init_state = ZSWAP_INIT_FAILED; zswap_enabled = false; return -ENOMEM; } static int __init zswap_init(void) { if (!zswap_enabled) return 0; return zswap_setup(); } /* must be late so crypto has time to come up */ late_initcall(zswap_init); MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); MODULE_DESCRIPTION("Compressed cache for swap pages");
38 32 31 39 2 3 1 38 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct features_req_info { struct ethnl_req_info base; }; struct features_reply_data { struct ethnl_reply_data base; u32 hw[ETHTOOL_DEV_FEATURE_WORDS]; u32 wanted[ETHTOOL_DEV_FEATURE_WORDS]; u32 active[ETHTOOL_DEV_FEATURE_WORDS]; u32 nochange[ETHTOOL_DEV_FEATURE_WORDS]; u32 all[ETHTOOL_DEV_FEATURE_WORDS]; }; #define FEATURES_REPDATA(__reply_base) \ container_of(__reply_base, struct features_reply_data, base) const struct nla_policy ethnl_features_get_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src) { unsigned int i; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; i++) dest[i] = src >> (32 * i); } static int features_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct features_reply_data *data = FEATURES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; netdev_features_t all_features; ethnl_features_to_bitmap32(data->hw, dev->hw_features); ethnl_features_to_bitmap32(data->wanted, dev->wanted_features); ethnl_features_to_bitmap32(data->active, dev->features); ethnl_features_to_bitmap32(data->nochange, NETIF_F_NEVER_CHANGE); all_features = GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0); ethnl_features_to_bitmap32(data->all, all_features); return 0; } static int features_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; unsigned int len = 0; int ret; ret = ethnl_bitset32_size(data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; return len; } static int features_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_HW, data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_WANTED, data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_ACTIVE, data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; return ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_NOCHANGE, data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); } const struct ethnl_request_ops ethnl_features_request_ops = { .request_cmd = ETHTOOL_MSG_FEATURES_GET, .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY, .hdr_attr = ETHTOOL_A_FEATURES_HEADER, .req_info_size = sizeof(struct features_req_info), .reply_data_size = sizeof(struct features_reply_data), .prepare_data = features_prepare_data, .reply_size = features_reply_size, .fill_reply = features_fill_reply, }; /* FEATURES_SET */ const struct nla_policy ethnl_features_set_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED }, }; static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val) { const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); unsigned int i; for (i = 0; i < words; i++) dest[i] = (unsigned long)(val >> (i * BITS_PER_LONG)); } static netdev_features_t ethnl_bitmap_to_features(unsigned long *src) { const unsigned int nft_bits = sizeof(netdev_features_t) * BITS_PER_BYTE; const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); netdev_features_t ret = 0; unsigned int i; for (i = 0; i < words; i++) ret |= (netdev_features_t)(src[i]) << (i * BITS_PER_LONG); ret &= ~(netdev_features_t)0 >> (nft_bits - NETDEV_FEATURE_COUNT); return ret; } static int features_send_reply(struct net_device *dev, struct genl_info *info, const unsigned long *wanted, const unsigned long *wanted_mask, const unsigned long *active, const unsigned long *active_mask, bool compact) { struct sk_buff *rskb; void *reply_payload; int reply_len = 0; int ret; reply_len = ethnl_reply_header_size(); ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY, ETHTOOL_A_FEATURES_HEADER, info, &reply_payload); if (!rskb) goto err; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_WANTED, wanted, wanted_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_ACTIVE, active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); return ret; nla_put_failure: nlmsg_free(rskb); WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n", reply_len); err: GENL_SET_ERR_MSG(info, "failed to send reply message"); return ret; } int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) { DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT); struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; bool mod; int ret; if (!tb[ETHTOOL_A_FEATURES_WANTED]) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEATURES_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; ethnl_features_to_bitmap(old_active, dev->features); ethnl_features_to_bitmap(old_wanted, dev->wanted_features); ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, tb[ETHTOOL_A_FEATURES_WANTED], netdev_features_strings, info->extack); if (ret < 0) goto out_ops; if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) { GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features"); ret = -EINVAL; goto out_ops; } /* set req_wanted bits not in req_mask from old_wanted */ bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT); if (!bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) { dev->wanted_features &= ~dev->hw_features; dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features; __netdev_update_features(dev); } ethnl_features_to_bitmap(new_active, dev->features); mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT); ret = 0; if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) { bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS; bitmap_xor(wanted_diff_mask, req_wanted, new_active, NETDEV_FEATURE_COUNT); bitmap_xor(active_diff_mask, old_active, new_active, NETDEV_FEATURE_COUNT); bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask, NETDEV_FEATURE_COUNT); bitmap_and(req_wanted, req_wanted, wanted_diff_mask, NETDEV_FEATURE_COUNT); bitmap_and(new_active, new_active, active_diff_mask, NETDEV_FEATURE_COUNT); ret = features_send_reply(dev, info, req_wanted, wanted_diff_mask, new_active, active_diff_mask, compact); } if (mod) netdev_features_change(dev); out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; }
5625 2 2 2 1 1 1 1 1 1 1077 1078 1077 1076 1077 2586 2 2584 5716 128 5624 4990 4987 4998 1077 1074 1078 4287 129 4441 280 280 3997 3998 3994 91 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 // SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * * Support four policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. * * preferred many Try a set of nodes first before normal fallback. This is * similar to preferred without the special case. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search * @state: State to filter the search * * Lookup the closest node by distance if @nid is not in state. * * Return: this @node if it is in state, otherwise the closest node by distance */ int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; if (state >= NR_NODE_STATES) return -EINVAL; if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_nearest_node); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. */ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. */ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(is_pmd_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pfn_folio(pmd_pfn(*pmd)); if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { if (is_migration_entry(pte_to_swp_entry(ptent))) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed++; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; goto unlock; } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) endvma = end; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy. */ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; struct page *page; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; page = alloc_pages_mpol(gfp, order, pol, ilx, nid); return page_rmappable_folio(page); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ if (new->mode == MPOL_INTERLEAVE) { struct page *page; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(page, &pagelist, lru) { if (!PageKsm(page)) break; } if (!list_entry_is_head(page, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma(page, vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = compound_order(page); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= page->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode != MPOL_BIND) return -EINVAL; *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; nid = next_node_in(current->il_prev, policy->nodes); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return z->zone ? zone_to_nid(z->zone) : node; } case MPOL_LOCAL: return node; default: BUG(); } } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask = pol->nodes; unsigned int target, nnodes; int i; int nid; /* * The barrier will stabilize the nodemask in a register or on * the stack so that it will stop changing under the code. * * Between first_node() and next_node(), pol->nodes could be changed * by other threads. So we put pol->nodes in a local stack. */ barrier(); nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). */ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_pages_node(nid, gfp | __GFP_THISNODE | __GFP_NORETRY, order); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_pages(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * @hugepage: Unused (was: For hugepages try only preferred node if possible). * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of alloc_pages_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; pgoff_t ilx; struct page *page; pol = get_vma_policy(vma, addr, order, &ilx); page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio); /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = __alloc_pages_bulk(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { nr_allocated = __alloc_pages_bulk(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; } /* alloc pages bulk and mempolicy should be considered at the * same time in some situation such as vmalloc. * * It can accelerate memory allocation especially interleaving * allocate memory. */ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return __alloc_pages_bulk(gfp, nid, nodemask, nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vma: vm area where folio mapped * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: /* Optimize placement among multiple nodes via NUMA balancing */ if (pol->flags & MPOL_F_MORON) { if (node_isset(thisnid, pol->nodes)) break; goto out; } fallthrough; case MPOL_PREFERRED_MANY: /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the * longest flag, "relative", and to display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); }
12 4 10 2 5 1 1 6 6 3 4 2 2 1 1 11 1 1 1 7 1 6 4 4 4 3 3 3 1 2 3 3 4 1 1 1 1 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 // SPDX-License-Identifier: GPL-2.0-only /* bpf/cpumap.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ /** * DOC: cpu map * The 'cpumap' is primarily used as a backend map for XDP BPF helper * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. * * Unlike devmap which redirects XDP frames out to another NIC device, * this map type redirects raw XDP frames to another CPU. The remote * CPU will do SKB-allocation and call the normal network stack. */ /* * This is a scalability and isolation mechanism, that allow * separating the early driver network XDP layer, from the rest of the * netstack, and assigning dedicated CPUs for this stage. This * basically allows for 10G wirespeed pre-filtering via bpf. */ #include <linux/bitops.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ptr_ring.h> #include <net/xdp.h> #include <net/hotdata.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/completion.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #include <linux/netdevice.h> /* netif_receive_skb_list */ #include <linux/etherdevice.h> /* eth_type_trans */ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ struct bpf_cpu_map_entry; struct bpf_cpu_map; struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; struct list_head flush_node; struct bpf_cpu_map_entry *obj; unsigned int count; }; /* Struct for every remote "destination" CPU in map */ struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; struct bpf_cpumap_val value; struct bpf_prog *prog; struct completion kthread_running; struct rcu_work free_work; }; struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry __rcu **cpu_map; }; static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || (value_size != offsetofend(struct bpf_cpumap_val, qsize) && value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (attr->max_entries > NR_CPUS) return ERR_PTR(-E2BIG); cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE); if (!cmap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&cmap->map, attr); /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) { bpf_map_area_free(cmap); return ERR_PTR(-ENOMEM); } return &cmap->map; } static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is * empty. See __cpu_map_entry_replace() and work-queue * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ void *ptr; while ((ptr = ptr_ring_consume(ring))) { WARN_ON_ONCE(1); if (unlikely(__ptr_test_bit(0, &ptr))) { __ptr_clear_bit(0, &ptr); kfree_skb(ptr); continue; } xdp_return_frame(ptr); } } static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, struct list_head *listp, struct xdp_cpumap_stats *stats) { struct sk_buff *skb, *tmp; struct xdp_buff xdp; u32 act; int err; list_for_each_entry_safe(skb, tmp, listp, list) { act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); switch (act) { case XDP_PASS: break; case XDP_REDIRECT: skb_list_del_init(skb); err = xdp_do_generic_redirect(skb->dev, skb, &xdp, rcpu->prog); if (unlikely(err)) { kfree_skb(skb); stats->drop++; } else { stats->redirect++; } return; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, rcpu->prog, act); fallthrough; case XDP_DROP: skb_list_del_init(skb); kfree_skb(skb); stats->drop++; return; } } } static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, void **frames, int n, struct xdp_cpumap_stats *stats) { struct xdp_rxq_info rxq = {}; struct xdp_buff xdp; int i, nframes = 0; xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; rxq.dev = xdpf->dev_rx; rxq.mem = xdpf->mem; /* TODO: report queue_index to xdp_rxq_info */ xdp_convert_frame_to_buff(xdpf, &xdp); act = bpf_prog_run_xdp(rcpu->prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (err < 0) { xdp_return_frame(xdpf); stats->drop++; } else { frames[nframes++] = xdpf; stats->pass++; } break; case XDP_REDIRECT: err = xdp_do_redirect(xdpf->dev_rx, &xdp, rcpu->prog); if (unlikely(err)) { xdp_return_frame(xdpf); stats->drop++; } else { stats->redirect++; } break; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_DROP: xdp_return_frame(xdpf); stats->drop++; break; } } xdp_clear_return_frame_no_direct(); return nframes; } #define CPUMAP_BATCH 8 static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, int xdp_n, struct xdp_cpumap_stats *stats, struct list_head *list) { int nframes; if (!rcpu->prog) return xdp_n; rcu_read_lock_bh(); nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); if (stats->redirect) xdp_do_flush(); if (unlikely(!list_empty(list))) cpu_map_bpf_prog_run_skb(rcpu, list, stats); rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; } static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); /* When kthread gives stop order, then rcpu have been disconnected * from map, thus no new packets can enter. Remaining in-flight * per CPU stored packets are flushed to this queue. Wait honoring * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; int i, n, m, nframes, xdp_n; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; LIST_HEAD(list); /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { set_current_state(TASK_INTERRUPTIBLE); /* Recheck to avoid lost wake-up */ if (__ptr_ring_empty(rcpu->queue)) { schedule(); sched = 1; } else { __set_current_state(TASK_RUNNING); } } else { sched = cond_resched(); } /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); for (i = 0, xdp_n = 0; i < n; i++) { void *f = frames[i]; struct page *page; if (unlikely(__ptr_test_bit(0, &f))) { struct sk_buff *skb = f; __ptr_clear_bit(0, &skb); list_add_tail(&skb->list, &list); continue; } frames[xdp_n++] = f; page = virt_to_page(f); /* Bring struct page memory area to curr CPU. Read by * build_skb_around via page_is_pfmemalloc(), and when * freed written by page_frag_free call. */ prefetchw(page); } /* Support running another XDP prog on this CPU */ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); if (nframes) { m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes, skbs); if (unlikely(m == 0)) { for (i = 0; i < nframes; i++) skbs[i] = NULL; /* effect: xdp_return_frame */ kmem_alloc_drops += nframes; } } local_bh_disable(); for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(xdpf, skb, xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; } list_add_tail(&skb->list, &list); } netif_receive_skb_list(&list); /* Feedback loop via tracepoint */ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); return 0; } static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, struct bpf_map *map, int fd) { struct bpf_prog *prog; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->expected_attach_type != BPF_XDP_CPUMAP || !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return -EINVAL; } rcpu->value.bpf_prog.id = prog->aux->id; rcpu->prog = prog; return 0; } static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, u32 cpu) { int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) return NULL; /* Alloc percpu bulkq */ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), sizeof(void *), gfp); if (!rcpu->bulkq) goto free_rcu; for_each_possible_cpu(i) { bq = per_cpu_ptr(rcpu->bulkq, i); bq->obj = rcpu; } /* Alloc queue */ rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) goto free_bulkq; err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; /* Setup kthread */ init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); if (IS_ERR(rcpu->kthread)) goto free_prog; /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); /* Make sure kthread has been running, so kthread_stop() will not * stop the kthread prematurely and all pending frames or skbs * will be handled by the kthread before kthread_stop() returns. */ wait_for_completion(&rcpu->kthread_running); return rcpu; free_prog: if (rcpu->prog) bpf_prog_put(rcpu->prog); free_ptr_ring: ptr_ring_cleanup(rcpu->queue, NULL); free_queue: kfree(rcpu->queue); free_bulkq: free_percpu(rcpu->bulkq); free_rcu: kfree(rcpu); return NULL; } static void __cpu_map_entry_free(struct work_struct *work) { struct bpf_cpu_map_entry *rcpu; /* This cpu_map_entry have been disconnected from map and one * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work); /* kthread_stop will wake_up_process and wait for it to complete. * cpu_map_kthread_run() makes sure the pointer ring is empty * before exiting. */ kthread_stop(rcpu->kthread); if (rcpu->prog) bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); kfree(rcpu->queue); free_percpu(rcpu->bulkq); kfree(rcpu); } /* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old * entry is no longer in use before freeing. We use queue_rcu_work() to call * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace * period. This means that (a) all pending enqueue and flush operations have * completed (because of the RCU callback), and (b) we are in a workqueue * context where we can stop the kthread and wait for it to exit before freeing * everything. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); queue_rcu_work(system_wq, &old_rcpu->free_work); } } static long cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; if (key_cpu >= map->max_entries) return -EINVAL; /* notice caller map_delete_elem() uses rcu_read_lock() */ __cpu_map_entry_replace(cmap, key_cpu, NULL); return 0; } static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(key_cpu >= cmap->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); if (!rcpu) return -ENOMEM; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); rcu_read_unlock(); return 0; } static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the bpf programs (can be more than one that used this map) were * disconnected from events. Wait for outstanding critical sections in * these programs to complete. synchronize_rcu() below not only * guarantees no further "XDP/bpf-side" reads against * bpf_cpu_map->cpu_map, but also ensure pending flush operations * (if any) are completed. */ synchronize_rcu(); /* The only possible user of bpf_cpu_map_entry is * cpu_map_kthread_run(). */ for (i = 0; i < cmap->map.max_entries; i++) { struct bpf_cpu_map_entry *rcpu; rcpu = rcu_dereference_raw(cmap->cpu_map[i]); if (!rcpu) continue; /* Stop kthread and cleanup entry directly */ __cpu_map_entry_free(&rcpu->free_work.work); } bpf_map_area_free(cmap->cpu_map); bpf_map_area_free(cmap); } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; if (key >= map->max_entries) return NULL; rcpu = rcu_dereference_check(cmap->cpu_map[key], rcu_read_lock_bh_held()); return rcpu; } static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= cmap->map.max_entries) { *next = 0; return 0; } if (index == cmap->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); } static u64 cpu_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_cpu_map); /* Currently the dynamically allocated elements are not counted */ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *); return usage; } BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map) const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, .map_delete_elem = cpu_map_delete_elem, .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, .map_mem_usage = cpu_map_mem_usage, .map_btf_id = &cpu_map_btf_ids[0], .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; int i; if (unlikely(!bq->count)) return; q = rcpu->queue; spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; int err; err = __ptr_ring_produce(q, xdpf); if (err) { drops++; xdp_return_frame_rx_napi(xdpf); } processed++; } bq->count = 0; spin_unlock(&q->producer_lock); __list_del_clearprev(&bq->flush_node); /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); } /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { /* Info needed when constructing SKB on remote CPU */ xdpf->dev_rx = dev_rx; bq_enqueue(rcpu, xdpf); return 0; } int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb) { int ret; __skb_pull(skb, skb->mac_len); skb_set_redirected(skb, false); __ptr_set_bit(0, &skb); ret = ptr_ring_produce(rcpu->queue, skb); if (ret < 0) goto trace; wake_up_process(rcpu->kthread); trace: trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu); return ret; } void __cpu_map_flush(void) { struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); } } #ifdef CONFIG_DEBUG_NET bool cpu_map_check_flush(void) { if (list_empty(this_cpu_ptr(&cpu_map_flush_list))) return false; __cpu_map_flush(); return true; } #endif static int __init cpu_map_init(void) { int cpu; for_each_possible_cpu(cpu) INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); return 0; } subsys_initcall(cpu_map_init);
765 767 650 617 26 26 3 290 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #include <linux/siphash.h> #include <asm/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 /* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, pid) __field(unsigned int, flags) __field(unsigned char, type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->pid = fl ? fl->c.flc_pid : 0; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ? fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, __entry->pid, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lease *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) __field(unsigned long, break_time) __field(unsigned long, downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->break_time = fl ? fl->fl_break_time : 0; __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->break_time, __entry->downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->c.flc_flags; __entry->l_fl_type = lease->c.flc_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->c.flc_flags; __entry->b_fl_type = breaker->c.flc_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
3 2 2 2 2 5 5 5 3 2 5 3 1 1 1 3 1 4 4 4 2 2 12 2 5 1 2 10 3 7 2 31 8 1 10 7 1 1 2 1 56 12 43 10 10 1 9 9 9 3 3 3 3 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/proto.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/random.h> #include <linux/slab.h> #include <net/checksum.h> #include <net/inet_sock.h> #include <net/inet_common.h> #include <net/sock.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> #include "ccid.h" #include "dccp.h" #include "feat.h" #define CREATE_TRACE_POINTS #include "trace.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); DEFINE_PER_CPU(unsigned int, dccp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ int sysctl_dccp_tx_qlen __read_mostly = 5; #ifdef CONFIG_IP_DCCP_DEBUG static const char *dccp_state_name(const int state) { static const char *const dccp_state_names[] = { [DCCP_OPEN] = "OPEN", [DCCP_REQUESTING] = "REQUESTING", [DCCP_PARTOPEN] = "PARTOPEN", [DCCP_LISTEN] = "LISTEN", [DCCP_RESPOND] = "RESPOND", [DCCP_CLOSING] = "CLOSING", [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", [DCCP_TIME_WAIT] = "TIME_WAIT", [DCCP_CLOSED] = "CLOSED", }; if (state >= DCCP_MAX_STATES) return "INVALID STATE!"; else return dccp_state_names[state]; } #endif void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, dccp_state_name(oldstate), dccp_state_name(state)); WARN_ON(state == oldstate); switch (state) { case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); /* Client retransmits all Confirm options until entering OPEN */ if (oldstate == DCCP_PARTOPEN) dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || oldstate == DCCP_CLOSING) DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash != NULL && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == DCCP_OPEN) DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_set_state(sk, state); } EXPORT_SYMBOL_GPL(dccp_set_state); static void dccp_finish_passive_close(struct sock *sk) { switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: /* Node (client or server) has received Close packet. */ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_set_state(sk, DCCP_CLOSED); break; case DCCP_PASSIVE_CLOSEREQ: /* * Client received CloseReq. We set the `active' flag so that * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. */ dccp_send_close(sk, 1); dccp_set_state(sk, DCCP_CLOSING); } } void dccp_done(struct sock *sk) { dccp_set_state(sk, DCCP_CLOSED); dccp_clear_xmit_timers(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(dccp_done); const char *dccp_packet_name(const int type) { static const char *const dccp_packet_names[] = { [DCCP_PKT_REQUEST] = "REQUEST", [DCCP_PKT_RESPONSE] = "RESPONSE", [DCCP_PKT_DATA] = "DATA", [DCCP_PKT_ACK] = "ACK", [DCCP_PKT_DATAACK] = "DATAACK", [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", [DCCP_PKT_CLOSE] = "CLOSE", [DCCP_PKT_RESET] = "RESET", [DCCP_PKT_SYNC] = "SYNC", [DCCP_PKT_SYNCACK] = "SYNCACK", }; if (type >= DCCP_NR_PKT_TYPES) return "INVALID"; else return dccp_packet_names[type]; } EXPORT_SYMBOL_GPL(dccp_packet_name); void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; } EXPORT_SYMBOL_GPL(dccp_destruct_common); static void dccp_sk_destruct(struct sock *sk) { dccp_destruct_common(sk); inet_sock_destruct(sk); } int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); INIT_LIST_HEAD(&dp->dccps_featneg); /* control socket doesn't need feat nego */ if (likely(ctl_sock_initialized)) return dccp_feat_init(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } /* Clean up a referenced DCCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(sk); kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); static inline int dccp_need_reset(int state) { return state != DCCP_CLOSED && state != DCCP_LISTEN && state != DCCP_REQUESTING; } int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); const int old_state = sk->sk_state; if (old_state != DCCP_CLOSED) dccp_set_state(sk, DCCP_CLOSED); /* * This corresponds to the ABORT function of RFC793, sec. 3.8 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". */ if (old_state == DCCP_LISTEN) { inet_csk_listen_stop(sk); } else if (dccp_need_reset(old_state)) { dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); sk->sk_err = ECONNRESET; } else if (old_state == DCCP_REQUESTING) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { __kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); icsk->icsk_backoff = 0; inet_csk_delack_init(sk); __sk_dst_reset(sk); WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk_error_report(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_disconnect); /* * Wait for a DCCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == DCCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes made by another threads is impossible in any case. */ mask = 0; if (READ_ONCE(sk->sk_err)) mask = EPOLLERR; shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected? */ if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { if (atomic_read(&sk->sk_rmem_alloc) > 0) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (sk_stream_is_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. */ if (sk_stream_is_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; } } } return mask; } EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) goto out; switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and * always 0, comparably to UDP. */ rc = 0; } break; case SIOCINQ: { struct sk_buff *skb; *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount of this packet since * that is all that will be read. */ *karg = skb->len; } rc = 0; } break; default: rc = -ENOIOCTLCMD; break; } out: release_sock(sk); return rc; } EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; if (service == DCCP_SERVICE_INVALID_VALUE || optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) return -EINVAL; if (optlen > sizeof(service)) { sl = kmalloc(optlen, GFP_KERNEL); if (sl == NULL) return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; if (copy_from_sockptr_offset(sl->dccpsl_list, optval, sizeof(service), optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; } } lock_sock(sk); dp->dccps_service = service; kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); return 0; } static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) { u8 *list, len; int i, rc; if (cscov < 0 || cscov > 15) return -EINVAL; /* * Populate a list of permissible values, in the range cscov...15. This * is necessary since feature negotiation of single values only works if * both sides incidentally choose the same value. Since the list starts * lowest-value first, negotiation will pick the smallest shared value. */ if (cscov == 0) return 0; len = 16 - cscov; list = kmalloc(len, GFP_KERNEL); if (list == NULL) return -ENOBUFS; for (i = 0; i < len; i++) list[i] = cscov++; rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); if (rc == 0) { if (rx) dccp_sk(sk)->dccps_pcrlen = cscov; else dccp_sk(sk)->dccps_pcslen = cscov; } kfree(list); return rc; } static int dccp_setsockopt_ccid(struct sock *sk, int type, sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); lock_sock(sk); if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); release_sock(sk); kfree(val); return rc; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CHANGE_L: case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CCID: case DCCP_SOCKOPT_RX_CCID: case DCCP_SOCKOPT_TX_CCID: return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) return dccp_setsockopt_service(sk, val, optval, optlen); lock_sock(sk); switch (optname) { case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; case DCCP_SOCKOPT_SEND_CSCOV: err = dccp_setsockopt_cscov(sk, val, false); break; case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; case DCCP_SOCKOPT_QPOLICY_ID: if (sk->sk_state != DCCP_CLOSED) err = -EISCONN; else if (val < 0 || val >= DCCPQ_POLICY_MAX) err = -EINVAL; else dp->dccps_qpolicy = val; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: if (val < 0) err = -EINVAL; else dp->dccps_tx_qlen = val; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); return do_dccp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_setsockopt); static int dccp_getsockopt_service(struct sock *sk, int len, __be32 __user *optval, int __user *optlen) { const struct dccp_sock *dp = dccp_sk(sk); const struct dccp_service_list *sl; int err = -ENOENT, slen = 0, total_len = sizeof(u32); lock_sock(sk); if ((sl = dp->dccps_service_list) != NULL) { slen = sl->dccpsl_nr * sizeof(u32); total_len += slen; } err = -EINVAL; if (total_len > len) goto out; err = 0; if (put_user(total_len, optlen) || put_user(dp->dccps_service, optval) || (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) err = -EFAULT; out: release_sock(sk); return err; } static int do_dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dccp_sock *dp; int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < (int)sizeof(int)) return -EINVAL; dp = dccp_sk(sk); switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_SERVICE: return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_TX_CCID: val = ccid_get_current_tx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_RX_CCID: val = ccid_get_current_rx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; case DCCP_SOCKOPT_SEND_CSCOV: val = dp->dccps_pcslen; break; case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; case DCCP_SOCKOPT_QPOLICY_ID: val = dp->dccps_qpolicy; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: val = dp->dccps_tx_qlen; break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); case 192 ... 255: return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, len, (u32 __user *)optval, optlen); default: return -ENOPROTOOPT; } len = sizeof(val); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); return do_dccp_getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_getsockopt); static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { struct cmsghdr *cmsg; /* * Assign an (opaque) qpolicy priority value to skb->priority. * * We are overloading this skb field for use with the qpolicy subystem. * The skb->priority is normally used for the SO_PRIORITY option, which * is initialised from sk_priority. Since the assignment of sk_priority * to skb->priority happens later (on layer 3), we overload this field * for use with queueing priorities as long as the skb is on layer 4. * The default priority value (if nothing is set) is 0. */ skb->priority = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_DCCP) continue; if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) return -EINVAL; switch (cmsg->cmsg_type) { case DCCP_SCM_PRIORITY: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) return -EINVAL; skb->priority = *(__u32 *)CMSG_DATA(cmsg); break; default: return -EINVAL; } } return 0; } int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { const struct dccp_sock *dp = dccp_sk(sk); const int flags = msg->msg_flags; const int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; int rc, size; long timeo; trace_dccp_probe(sk, len); if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); timeo = sock_sndtimeo(sk, noblock); /* * We have to use sk_stream_wait_connect here to set sk_write_pending, * so that the trick in dccp_rcv_request_sent_state_process. */ /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_release; size = sk->sk_prot->max_header + len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (skb == NULL) goto out_release; if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_discard; } if (sk->sk_state == DCCP_CLOSED) { rc = -ENOTCONN; goto out_discard; } /* We need to check dccps_mss_cache after socket is locked. */ if (len > dp->dccps_mss_cache) { rc = -EMSGSIZE; goto out_discard; } skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) goto out_discard; rc = dccp_msghdr_parse(msg, skb); if (rc != 0) goto out_discard; dccp_qpolicy_push(sk, skb); /* * The xmit_timer is set if the TX CCID is rate-based and will expire * when congestion control permits to release further packets into the * network. Window-based CCIDs do not use this timer. */ if (!timer_pending(&dp->dccps_xmit_timer)) dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; out_discard: kfree_skb(skb); goto out_release; } EXPORT_SYMBOL_GPL(dccp_sendmsg); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { const struct dccp_hdr *dh; long timeo; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) { len = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) goto verify_sock_status; dh = dccp_hdr(skb); switch (dh->dccph_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: goto found_ok_skb; case DCCP_PKT_CLOSE: case DCCP_PKT_CLOSEREQ: if (!(flags & MSG_PEEK)) dccp_finish_passive_close(sk); fallthrough; case DCCP_PKT_RESET: dccp_pr_debug("found fin (%s) ok!\n", dccp_packet_name(dh->dccph_type)); len = 0; goto found_fin_ok; default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; break; } if (sk->sk_err) { len = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { len = 0; break; } if (sk->sk_state == DCCP_CLOSED) { if (!sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read * from never connected socket. */ len = -ENOTCONN; break; } len = 0; break; } if (!timeo) { len = -EAGAIN; break; } if (signal_pending(current)) { len = sock_intr_errno(timeo); break; } sk_wait_data(sk, &timeo, NULL); continue; found_ok_skb: if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; } if (flags & MSG_TRUNC) len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; } while (1); out: release_sock(sk); return len; } EXPORT_SYMBOL_GPL(dccp_recvmsg); int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; int err; lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) goto out; old_state = sk->sk_state; if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != DCCP_LISTEN) { struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; /* do not start to listen if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dp)) { err = -EPROTO; goto out; } err = inet_csk_listen_start(sk); if (err) goto out; } err = 0; out: release_sock(sk); return err; } EXPORT_SYMBOL_GPL(inet_dccp_listen); static void dccp_terminate_connection(struct sock *sk) { u8 next_state = DCCP_CLOSED; switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: case DCCP_PASSIVE_CLOSEREQ: dccp_finish_passive_close(sk); break; case DCCP_PARTOPEN: dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); fallthrough; case DCCP_OPEN: dccp_send_close(sk, 1); if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && !dccp_sk(sk)->dccps_server_timewait) next_state = DCCP_ACTIVE_CLOSEREQ; else next_state = DCCP_CLOSING; fallthrough; default: dccp_set_state(sk, next_state); } } void dccp_close(struct sock *sk, long timeout) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; u32 data_was_unread = 0; int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == DCCP_LISTEN) { dccp_set_state(sk, DCCP_CLOSED); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } sk_stop_timer(sk, &dp->dccps_xmit_timer); /* * We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the *reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { data_was_unread += skb->len; __kfree_skb(skb); } /* If socket has been already reset kill it. */ if (sk->sk_state == DCCP_CLOSED) goto adjudge_to_death; if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_set_state(sk, DCCP_CLOSED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { /* * Normal connection termination. May need to wait if there are * still packets in the TX queue that are delayed by the CCID. */ dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } /* * Flush write queue. This may be necessary in several cases: * - we have been closed by the peer but still have application data; * - abortive termination (unread data or zero linger time), * - normal termination but queue could not be flushed within time limit */ __skb_queue_purge(&sk->sk_write_queue); sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); /* * It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); /* * Now socket is owned by kernel and we acquire BH lock * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); this_cpu_inc(dccp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) goto out; if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } EXPORT_SYMBOL_GPL(dccp_close); void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("called shutdown(%x)\n", how); } EXPORT_SYMBOL_GPL(dccp_shutdown); static inline int __init dccp_mib_init(void) { dccp_statistics = alloc_percpu(struct dccp_mib); if (!dccp_statistics) return -ENOMEM; return 0; } static inline void dccp_mib_exit(void) { free_percpu(dccp_statistics); } static int thash_entries; module_param(thash_entries, int, 0444); MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); #ifdef CONFIG_IP_DCCP_DEBUG bool dccp_debug; module_param(dccp_debug, bool, 0644); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; dccp_hashinfo.bind2_bucket_cachep = kmem_cache_create("dccp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind2_bucket_cachep) goto out_free_bind_bucket_cachep; /* * Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ if (nr_pages >= (128 * 1024)) goal = nr_pages >> (21 - PAGE_SHIFT); else goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) ; do { unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / sizeof(struct inet_ehash_bucket); while (hash_size & (hash_size - 1)) hash_size--; dccp_hashinfo.ehash_mask = hash_size - 1; dccp_hashinfo.ehash = (struct inet_ehash_bucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); } while (!dccp_hashinfo.ehash && --ehash_order > 0); if (!dccp_hashinfo.ehash) { DCCP_CRIT("Failed to allocate DCCP established hash table"); goto out_free_bind2_bucket_cachep; } for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&dccp_hashinfo)) goto out_free_dccp_ehash; bhash_order = ehash_order; do { dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / sizeof(struct inet_bind_hashbucket); if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) continue; dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); } while (!dccp_hashinfo.bhash && --bhash_order >= 0); if (!dccp_hashinfo.bhash) { DCCP_CRIT("Failed to allocate DCCP bind hash table"); goto out_free_dccp_locks; } dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); if (!dccp_hashinfo.bhash2) { DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); goto out_free_dccp_bhash; } for (i = 0; i < dccp_hashinfo.bhash_size; i++) { spin_lock_init(&dccp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); spin_lock_init(&dccp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } dccp_hashinfo.pernet = false; rc = dccp_mib_init(); if (rc) goto out_free_dccp_bhash2; rc = dccp_ackvec_init(); if (rc) goto out_free_dccp_mib; rc = dccp_sysctl_init(); if (rc) goto out_ackvec_exit; rc = ccid_initialize_builtins(); if (rc) goto out_sysctl_exit; dccp_timestamping_init(); return 0; out_sysctl_exit: dccp_sysctl_exit(); out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: dccp_mib_exit(); out_free_dccp_bhash2: free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); out_free_dccp_locks: inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); out_free_bind2_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); out_fail: dccp_hashinfo.bhash = NULL; dccp_hashinfo.bhash2 = NULL; dccp_hashinfo.ehash = NULL; dccp_hashinfo.bind_bucket_cachep = NULL; dccp_hashinfo.bind2_bucket_cachep = NULL; return rc; } static void __exit dccp_fini(void) { int bhash_order = get_order(dccp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); ccid_cleanup_builtins(); dccp_mib_exit(); free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); free_pages((unsigned long)dccp_hashinfo.ehash, get_order((dccp_hashinfo.ehash_mask + 1) * sizeof(struct inet_ehash_bucket))); inet_ehash_locks_free(&dccp_hashinfo); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); inet_hashinfo2_free_mod(&dccp_hashinfo); } module_init(dccp_init); module_exit(dccp_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
142 21 17 21 16 6 8 8 14 14 6 2 2 6 31 1 15 5 57 57 53 54 54 2 54 54 41 1 46 40 46 1 9 18 6 10 2 10 10 10 18 5 22 19 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 /* SPDX-License-Identifier: GPL-2.0 */ /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __MPTCP_PROTOCOL_H #define __MPTCP_PROTOCOL_H #include <linux/random.h> #include <net/tcp.h> #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> #include <net/genetlink.h> #include "mptcp_pm_gen.h" #define MPTCP_SUPPORTED_VERSION 1 /* MPTCP option bits */ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) #define OPTION_MPTCP_MPJ_SYN BIT(3) #define OPTION_MPTCP_MPJ_SYNACK BIT(4) #define OPTION_MPTCP_MPJ_ACK BIT(5) #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_RM_ADDR BIT(7) #define OPTION_MPTCP_FASTCLOSE BIT(8) #define OPTION_MPTCP_PRIO BIT(9) #define OPTION_MPTCP_RST BIT(10) #define OPTION_MPTCP_DSS BIT(11) #define OPTION_MPTCP_FAIL BIT(12) #define OPTION_MPTCP_CSUMREQD BIT(13) #define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \ OPTION_MPTCP_MPC_ACK) #define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \ OPTION_MPTCP_MPJ_ACK) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 #define MPTCPOPT_MP_JOIN 1 #define MPTCPOPT_DSS 2 #define MPTCPOPT_ADD_ADDR 3 #define MPTCPOPT_RM_ADDR 4 #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 #define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 #define TCPOLEN_MPTCP_MPJ_SYN 12 #define TCPOLEN_MPTCP_MPJ_SYNACK 16 #define TCPOLEN_MPTCP_MPJ_ACK 24 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 #define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 #define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 #define TCPOLEN_MPTCP_ADD_ADDR6 28 #define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_PORT_ALIGN 2 #define TCPOLEN_MPTCP_RM_ADDR_BASE 3 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 #define TCPOLEN_MPTCP_FAIL 12 #define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) #define MPTCP_CAP_DENY_JOIN_ID0 BIT(5) #define MPTCP_CAP_HMAC_SHA256 BIT(0) #define MPTCP_CAP_FLAG_MASK (0x1F) /* MPTCP DSS flags */ #define MPTCP_DSS_DATA_FIN BIT(4) #define MPTCP_DSS_DSN64 BIT(3) #define MPTCP_DSS_HAS_MAP BIT(2) #define MPTCP_DSS_ACK64 BIT(1) #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) /* MPTCP ADD_ADDR flags */ #define MPTCP_ADDR_ECHO BIT(0) /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) /* MPTCP TCPRST flags */ #define MPTCP_RST_TRANSIENT BIT(0) /* MPTCP socket atomic flags */ #define MPTCP_WORK_RTX 1 #define MPTCP_FALLBACK_DONE 2 #define MPTCP_WORK_CLOSE_SUBFLOW 3 /* MPTCP socket release cb flags */ #define MPTCP_PUSH_PENDING 1 #define MPTCP_CLEAN_UNA 2 #define MPTCP_ERROR_REPORT 3 #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; u8 has_rxtstamp:1; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) static inline bool before64(__u64 seq1, __u64 seq2) { return (__s64)(seq1 - seq2) < 0; } #define after64(seq2, seq1) before64(seq1, seq2) struct mptcp_options_received { u64 sndr_key; u64 rcvr_key; u64 data_ack; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u16 suboptions; u32 token; u32 nonce; u16 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, reset_reason:4, reset_transient:1, echo:1, backup:1, deny_join_id0:1, __unused:2; u8 join_id; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; struct mptcp_rm_list rm_list; u64 ahmac; u64 fail_seq; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | ((nib & 0xF) << 8) | field); } enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, MPTCP_PM_SUBFLOW_ESTABLISHED, MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is * accounted int id_avail_bitmap */ }; enum mptcp_pm_type { MPTCP_PM_TYPE_KERNEL = 0, MPTCP_PM_TYPE_USERSPACE, __MPTCP_PM_TYPE_NR, __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1, }; /* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */ #define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1) enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, MPTCP_RM_ADDR_SIGNAL, }; /* max value of mptcp_addr_info.id */ #define MPTCP_PM_MAX_ADDR_ID U8_MAX struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; struct list_head anno_list; struct list_head userspace_pm_local_addr_list; spinlock_t lock; /*protects the whole PM data */ u8 addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; bool remote_deny_join_id0; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; u8 subflows; u8 status; DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_rx; }; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; u8 flags; int ifindex; struct socket *lsk; }; struct mptcp_data_frag { struct list_head list; u64 data_seq; u16 data_len; u16 offset; u16 overhead; u16 already_sent; struct page *page; }; /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; u64 local_key; /* protected by the first subflow socket lock * lockless access read */ u64 remote_key; /* same as above */ u64 write_seq; u64 bytes_sent; u64 snd_nxt; u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; u64 bytes_retrans; u64 bytes_consumed; int rmem_fwd_alloc; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; * recovery related fields are under data_lock * protection */ u64 bytes_acked; u64 snd_una; u64 wnd_end; unsigned long timer_ival; u32 token; int rmem_released; unsigned long flags; unsigned long cb_flags; bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; bool allow_infinite_fallback; u8 pending_state; /* A subflow asked to set this sk_state, * protected by the msk data lock */ u8 mpc_endpoint_id; u8 recvmsg_inq:1, cork:1, nodelay:1, fastopening:1, in_accept_queue:1, free_first:1, rcvspace_init:1; u32 notsent_lowat; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; struct sock *first; /* The mptcp ops can safely dereference, using suitable * ONCE annotation, the subflow outside the socket * lock as such sock is freed after close(). */ struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { u32 space; /* bytes copied in last measurement window */ u32 copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; u8 scaling_ratio; u32 subflow_id; u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) extern struct genl_family mptcp_genl_family; static inline void msk_owned_by_me(const struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); } #ifdef CONFIG_DEBUG_NET /* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */ #undef tcp_sk #define tcp_sk(ptr) ({ \ typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_TCP); \ container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk); \ }) #define mptcp_sk(ptr) ({ \ typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP); \ container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk); \ }) #else /* !CONFIG_DEBUG_NET */ #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) #endif /* the msk socket don't use the backlog, also account for the bulk * free memory */ static inline int __mptcp_rmem(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); } static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); return READ_ONCE(msk->first_pending); } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *cur; cur = msk->first_pending; return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : list_next_entry(cur, list); } static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->first_pending) return NULL; if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (msk->snd_una == msk->snd_nxt) return NULL; return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } struct csum_pseudo_header { __be64 data_seq; __be32 subflow_seq; __be16 data_len; __sum16 csum; }; struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, backup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; u64 idsn; u32 token; u32 ssn_offset; u64 thmac; u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * mptcp_subflow_rsk(const struct request_sock *rsk) { return (struct mptcp_subflow_request_sock *)rsk; } struct mptcp_delegated_action { struct napi_struct napi; struct list_head head; }; DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); #define MPTCP_DELEGATE_SCHEDULED 0 #define MPTCP_DELEGATE_SEND 1 #define MPTCP_DELEGATE_ACK 2 #define MPTCP_DELEGATE_SNDBUF 3 #define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ struct_group(reset, unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; u64 remote_key; u64 idsn; u64 map_seq; u32 snd_isn; u32 token; u32 rel_write_seq; u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; __wsum map_data_csum; u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ mp_join : 1, /* remote is JOINing */ fully_established : 1, /* path validated */ pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, map_csum_reqd : 1, map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ __unused : 10; bool data_avail; bool scheduled; u32 remote_nonce; u64 thmac; u32 local_nonce; u32 remote_token; union { u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ u64 iasn; /* initial ack sequence number, MPC subflows only */ }; s16 local_id; /* if negative not initialized yet */ u8 remote_id; u8 reset_seen:1; u8 reset_transient:1; u8 reset_reason:4; u8 stale_count; u32 subflow_id; long delegated_status; unsigned long fail_tout; ); struct list_head delegated_node; /* link into delegated_action, protected by local BH */ u32 setsockopt_seq; u32 stale_rcv_tstamp; int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf, * protected by the msk socket lock */ struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_state_change)(struct sock *sk); void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; }; static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; } static inline struct sock * mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) { return subflow->tcp_sock; } static inline void mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; WRITE_ONCE(subflow->local_id, -1); } static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - subflow->ssn_offset - subflow->map_subflow_seq; } static inline u64 mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) { return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } void mptcp_subflow_process_delegated(struct sock *ssk, long actions); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) { long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action); struct mptcp_delegated_action *delegated; bool schedule; /* the caller held the subflow bh socket lock */ lockdep_assert_in_softirq(); /* The implied barrier pairs with tcp_release_cb_override() * mptcp_napi_poll(), and ensures the below list check sees list * updates done prior to delegated status bits changes */ old = set_mask_bits(&subflow->delegated_status, 0, set_bits); if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) { if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); } } static inline struct mptcp_subflow_context * mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; if (list_empty(&delegated->head)) return NULL; ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); return ret; } int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched); void mptcp_release_sched(struct mptcp_sock *msk); void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled); struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); int mptcp_sched_get_send(struct mptcp_sock *msk); int mptcp_sched_get_retrans(struct mptcp_sock *msk); static inline u64 mptcp_data_avail(const struct mptcp_sock *msk) { return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed); } static inline bool mptcp_epollin_ready(const struct sock *sk) { /* mptcp doesn't have to deal with small skbs in the receive queue, * at it can always coalesce them */ return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) || (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) || READ_ONCE(tcp_memory_pressure); } int mptcp_set_rcvlowat(struct sock *sk, int val); static inline bool __tcp_can_send(const struct sock *ssk) { /* only send if our side has not closed yet */ return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ if (subflow->request_join && !subflow->fully_established) return false; return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); } void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); void mptcp_subflow_drop_ctx(struct sock *ssk); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { sk->sk_data_ready = sock_def_readable; sk->sk_state_change = ctx->tcp_state_change; sk->sk_write_space = sk_stream_write_space; sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); void __mptcp_sync_state(struct sock *sk, int state); void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout); static inline void mptcp_stop_tout_timer(struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp) return; sk_stop_timer(sk, &sk->sk_timer); inet_csk(sk)->icsk_mtup.probe_timestamp = 0; } static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout) { /* avoid 0 timestamp, as that means no close timeout */ inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1; } static inline void mptcp_start_tout_timer(struct sock *sk) { mptcp_set_close_tout(sk, tcp_jiffies32); mptcp_reset_tout_timer(mptcp_sk(sk), 0); } static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && READ_ONCE(mptcp_sk(sk)->fully_established); } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); int mptcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) { if (use_64bit) return cur_seq; return __mptcp_expand_seq(old_seq, cur_seq); } void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { return READ_ONCE(msk->snd_data_fin_enable) && READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); } static inline u32 mptcp_notsent_lowat(const struct sock *sk) { struct net *net = sock_net(sk); u32 val; val = READ_ONCE(mptcp_sk(sk)->notsent_lowat); return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 notsent_bytes; notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt); return (notsent_bytes << wake) < mptcp_notsent_lowat(sk); } static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake) { return mptcp_stream_memory_free(sk, wake) && __sk_stream_is_writeable(sk, wake); } static inline void mptcp_write_space(struct sock *sk) { /* pairs with memory barrier in mptcp_poll */ smp_mb(); if (mptcp_stream_memory_free(sk, 1)) sk_stream_write_space(sk); } static inline void __mptcp_sync_sndbuf(struct sock *sk) { struct mptcp_subflow_context *subflow; int ssk_sndbuf, new_sndbuf; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; new_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]); mptcp_for_each_subflow(mptcp_sk(sk), subflow) { ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf); subflow->cached_sndbuf = ssk_sndbuf; new_sndbuf += ssk_sndbuf; } /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */ WRITE_ONCE(sk->sk_sndbuf, new_sndbuf); mptcp_write_space(sk); } /* The called held both the msk socket and the subflow socket locks, * possibly under BH */ static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf) __mptcp_sync_sndbuf(sk); } /* the caller held only the subflow socket lock, either in process or * BH context. Additionally this can be called under the msk data lock, * so we can't acquire such lock here: let the delegate action acquires * the needed locks in suitable order. */ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf)) return; local_bh_disable(); mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF); local_bh_enable(); } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); #define MPTCP_TOKEN_MAX_RETRIES 4 void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { mptcp_subflow_rsk(req)->token_node.pprev = NULL; } int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *ssk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry); bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk); void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct mptcp_subflow_context *subflow); void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id); struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr); int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); void mptcp_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & (BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO)); } static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); } static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE; } static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL; } static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; if (family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; if (!echo) len += MPTCPOPT_THMAC_LEN; /* account for 2 trailing 'nop' options */ if (port) len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN; return len; } static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) { if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX) return -EINVAL; return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info); int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info); int mptcp_userspace_pm_get_addr(struct sk_buff *skb, struct genl_info *info); static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) { int local_id = READ_ONCE(subflow->local_id); if (local_id < 0) return 0; return local_id; } void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) { spin_lock_bh(&msk->pm.lock); __mptcp_pm_close_subflow(msk); spin_unlock_bh(&msk->pm.lock); } void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) { return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); } static inline bool mptcp_check_fallback(const struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); return __mptcp_check_fallback(msk); } static inline void __mptcp_do_fallback(struct mptcp_sock *msk) { if (__mptcp_check_fallback(msk)) { pr_debug("TCP fallback already done (msk=%p)", msk); return; } set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) { struct sock *ssk = READ_ONCE(msk->first); return ssk && ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_LISTEN)); } static inline void mptcp_do_fallback(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; struct mptcp_sock *msk; msk = mptcp_sk(sk); __mptcp_do_fallback(msk); if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { gfp_t saved_allocation = ssk->sk_allocation; /* we are in a atomic (BH) scope, override ssk default for data * fin allocation */ ssk->sk_allocation = GFP_ATOMIC; ssk->sk_shutdown |= SEND_SHUTDOWN; tcp_shutdown(ssk, SEND_SHUTDOWN); ssk->sk_allocation = saved_allocation; } } #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) static inline bool mptcp_check_infinite_map(struct sk_buff *skb) { struct mptcp_ext *mpext; mpext = skb ? mptcp_get_ext(skb) : NULL; if (mpext && mpext->infinite_map) return true; return false; } static inline bool is_active_ssk(struct mptcp_subflow_context *subflow) { return (subflow->request_mptcp || subflow->request_join); } static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) && is_active_ssk(subflow) && !subflow->conn_finished; } #ifdef CONFIG_SYN_COOKIES void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); void __init mptcp_join_cookie_init(void); #else static inline void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) {} static inline bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) { return false; } static inline void mptcp_join_cookie_init(void) {} #endif #endif /* __MPTCP_PROTOCOL_H */
4 9 4 4 7 12 1 11 9 5 4 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/pagemap.h> static int iomap_to_fiemap(struct fiemap_extent_info *fi, const struct iomap *iomap, u32 flags) { switch (iomap->type) { case IOMAP_HOLE: /* skip holes */ return 0; case IOMAP_DELALLOC: flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; break; case IOMAP_MAPPED: break; case IOMAP_UNWRITTEN: flags |= FIEMAP_EXTENT_UNWRITTEN; break; case IOMAP_INLINE: flags |= FIEMAP_EXTENT_DATA_INLINE; break; } if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, iomap->length, flags); } static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) return iomap_length(iter); ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; switch (ret) { case 0: /* success */ return iomap_length(iter); case 1: /* extent array full */ return 0; default: /* error */ return ret; } } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, u64 start, u64 len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = start, .len = len, .flags = IOMAP_REPORT, }; struct iomap prev = { .type = IOMAP_HOLE, }; int ret; ret = fiemap_prep(inode, fi, start, &iter.len, 0); if (ret) return ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); if (ret < 0) return ret; } /* inode with no (attribute) mapping will give ENOENT */ if (ret < 0 && ret != -ENOENT) return ret; return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); /* legacy ->bmap interface. 0 is the error return (!) */ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = mapping->host, .pos = (loff_t)bno << mapping->host->i_blkbits, .len = i_blocksize(mapping->host), .flags = IOMAP_REPORT, }; const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT; int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; /* leave iter.processed unset to abort loop */ } if (ret) return 0; return bno; } EXPORT_SYMBOL_GPL(iomap_bmap);
61 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 // SPDX-License-Identifier: GPL-2.0-or-later /* * cn_proc.c - process events connector * * Copyright (C) Matt Helsley, IBM Corp. 2005 * Based on cn_fork.c by Guillaume Thouvenin <guillaume.thouvenin@bull.net> * Original copyright notice follows: * Copyright (C) 2005 BULL SA. */ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/init.h> #include <linux/connector.h> #include <linux/gfp.h> #include <linux/ptrace.h> #include <linux/atomic.h> #include <linux/pid_namespace.h> #include <linux/cn_proc.h> #include <linux/local_lock.h> /* * Size of a cn_msg followed by a proc_event structure. Since the * sizeof struct cn_msg is a multiple of 4 bytes, but not 8 bytes, we * add one 4-byte word to the size here, and then start the actual * cn_msg structure 4 bytes into the stack buffer. The result is that * the immediately following proc_event structure is aligned to 8 bytes. */ #define CN_PROC_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct proc_event) + 4) /* See comment above; we test our assumption about sizeof struct cn_msg here. */ static inline struct cn_msg *buffer_to_cn_msg(__u8 *buffer) { BUILD_BUG_ON(sizeof(struct cn_msg) != 20); return (struct cn_msg *)(buffer + 4); } static atomic_t proc_event_num_listeners = ATOMIC_INIT(0); static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC }; /* local_event.count is used as the sequence number of the netlink message */ struct local_event { local_lock_t lock; __u32 count; }; static DEFINE_PER_CPU(struct local_event, local_event) = { .lock = INIT_LOCAL_LOCK(lock), }; static int cn_filter(struct sock *dsk, struct sk_buff *skb, void *data) { __u32 what, exit_code, *ptr; enum proc_cn_mcast_op mc_op; uintptr_t val; if (!dsk || !dsk->sk_user_data || !data) return 0; ptr = (__u32 *)data; what = *ptr++; exit_code = *ptr; val = ((struct proc_input *)(dsk->sk_user_data))->event_type; mc_op = ((struct proc_input *)(dsk->sk_user_data))->mcast_op; if (mc_op == PROC_CN_MCAST_IGNORE) return 1; if ((__u32)val == PROC_EVENT_ALL) return 0; /* * Drop packet if we have to report only non-zero exit status * (PROC_EVENT_NONZERO_EXIT) and exit status is 0 */ if (((__u32)val & PROC_EVENT_NONZERO_EXIT) && (what == PROC_EVENT_EXIT)) { if (exit_code) return 0; } if ((__u32)val & what) return 0; return 1; } static inline void send_msg(struct cn_msg *msg) { __u32 filter_data[2]; local_lock(&local_event.lock); msg->seq = __this_cpu_inc_return(local_event.count) - 1; ((struct proc_event *)msg->data)->cpu = smp_processor_id(); /* * local_lock() disables preemption during send to ensure the messages * are ordered according to their sequence numbers. * * If cn_netlink_send() fails, the data is not sent. */ filter_data[0] = ((struct proc_event *)msg->data)->what; if (filter_data[0] == PROC_EVENT_EXIT) { filter_data[1] = ((struct proc_event *)msg->data)->event_data.exit.exit_code; } else { filter_data[1] = 0; } cn_netlink_send_mult(msg, msg->len, 0, CN_IDX_PROC, GFP_NOWAIT, cn_filter, (void *)filter_data); local_unlock(&local_event.lock); } void proc_fork_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); struct task_struct *parent; if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_FORK; rcu_read_lock(); parent = rcu_dereference(task->real_parent); ev->event_data.fork.parent_pid = parent->pid; ev->event_data.fork.parent_tgid = parent->tgid; rcu_read_unlock(); ev->event_data.fork.child_pid = task->pid; ev->event_data.fork.child_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_exec_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_EXEC; ev->event_data.exec.process_pid = task->pid; ev->event_data.exec.process_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_id_connector(struct task_struct *task, int which_id) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); const struct cred *cred; if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->what = which_id; ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; rcu_read_lock(); cred = __task_cred(task); if (which_id == PROC_EVENT_UID) { ev->event_data.id.r.ruid = from_kuid_munged(&init_user_ns, cred->uid); ev->event_data.id.e.euid = from_kuid_munged(&init_user_ns, cred->euid); } else if (which_id == PROC_EVENT_GID) { ev->event_data.id.r.rgid = from_kgid_munged(&init_user_ns, cred->gid); ev->event_data.id.e.egid = from_kgid_munged(&init_user_ns, cred->egid); } else { rcu_read_unlock(); return; } rcu_read_unlock(); ev->timestamp_ns = ktime_get_ns(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_sid_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_SID; ev->event_data.sid.process_pid = task->pid; ev->event_data.sid.process_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_ptrace_connector(struct task_struct *task, int ptrace_id) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_PTRACE; ev->event_data.ptrace.process_pid = task->pid; ev->event_data.ptrace.process_tgid = task->tgid; if (ptrace_id == PTRACE_ATTACH) { ev->event_data.ptrace.tracer_pid = current->pid; ev->event_data.ptrace.tracer_tgid = current->tgid; } else if (ptrace_id == PTRACE_DETACH) { ev->event_data.ptrace.tracer_pid = 0; ev->event_data.ptrace.tracer_tgid = 0; } else return; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_comm_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_COMM; ev->event_data.comm.process_pid = task->pid; ev->event_data.comm.process_tgid = task->tgid; get_task_comm(ev->event_data.comm.comm, task); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_coredump_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; struct task_struct *parent; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_COREDUMP; ev->event_data.coredump.process_pid = task->pid; ev->event_data.coredump.process_tgid = task->tgid; rcu_read_lock(); if (pid_alive(task)) { parent = rcu_dereference(task->real_parent); ev->event_data.coredump.parent_pid = parent->pid; ev->event_data.coredump.parent_tgid = parent->tgid; } rcu_read_unlock(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_exit_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; struct task_struct *parent; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_EXIT; ev->event_data.exit.process_pid = task->pid; ev->event_data.exit.process_tgid = task->tgid; ev->event_data.exit.exit_code = task->exit_code; ev->event_data.exit.exit_signal = task->exit_signal; rcu_read_lock(); if (pid_alive(task)) { parent = rcu_dereference(task->real_parent); ev->event_data.exit.parent_pid = parent->pid; ev->event_data.exit.parent_tgid = parent->tgid; } rcu_read_unlock(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } /* * Send an acknowledgement message to userspace * * Use 0 for success, EFOO otherwise. * Note: this is the negative of conventional kernel error * values because it's not being returned via syscall return * mechanisms. */ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); msg->seq = rcvd_seq; ev->timestamp_ns = ktime_get_ns(); ev->cpu = -1; ev->what = PROC_EVENT_NONE; ev->event_data.ack.err = err; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = rcvd_ack + 1; msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } /** * cn_proc_mcast_ctl * @msg: message sent from userspace via the connector * @nsp: NETLINK_CB of the client's socket buffer */ static void cn_proc_mcast_ctl(struct cn_msg *msg, struct netlink_skb_parms *nsp) { enum proc_cn_mcast_op mc_op = 0, prev_mc_op = 0; struct proc_input *pinput = NULL; enum proc_cn_event ev_type = 0; int err = 0, initial = 0; struct sock *sk = NULL; /* * Events are reported with respect to the initial pid * and user namespaces so ignore requestors from * other namespaces. */ if ((current_user_ns() != &init_user_ns) || !task_is_in_init_pid_ns(current)) return; if (msg->len == sizeof(*pinput)) { pinput = (struct proc_input *)msg->data; mc_op = pinput->mcast_op; ev_type = pinput->event_type; } else if (msg->len == sizeof(mc_op)) { mc_op = *((enum proc_cn_mcast_op *)msg->data); ev_type = PROC_EVENT_ALL; } else { return; } ev_type = valid_event((enum proc_cn_event)ev_type); if (ev_type == PROC_EVENT_NONE) ev_type = PROC_EVENT_ALL; if (nsp->sk) { sk = nsp->sk; if (sk->sk_user_data == NULL) { sk->sk_user_data = kzalloc(sizeof(struct proc_input), GFP_KERNEL); if (sk->sk_user_data == NULL) { err = ENOMEM; goto out; } initial = 1; } else { prev_mc_op = ((struct proc_input *)(sk->sk_user_data))->mcast_op; } ((struct proc_input *)(sk->sk_user_data))->event_type = ev_type; ((struct proc_input *)(sk->sk_user_data))->mcast_op = mc_op; } switch (mc_op) { case PROC_CN_MCAST_LISTEN: if (initial || (prev_mc_op != PROC_CN_MCAST_LISTEN)) atomic_inc(&proc_event_num_listeners); break; case PROC_CN_MCAST_IGNORE: if (!initial && (prev_mc_op != PROC_CN_MCAST_IGNORE)) atomic_dec(&proc_event_num_listeners); ((struct proc_input *)(sk->sk_user_data))->event_type = PROC_EVENT_NONE; break; default: err = EINVAL; break; } out: cn_proc_ack(err, msg->seq, msg->ack); } /* * cn_proc_init - initialization entry point * * Adds the connector callback to the connector driver. */ static int __init cn_proc_init(void) { int err = cn_add_callback(&cn_proc_event_id, "cn_proc", &cn_proc_mcast_ctl); if (err) { pr_warn("cn_proc failed to register\n"); return err; } return 0; } device_initcall(cn_proc_init);
14 14 14 14 14 14 9 9 14 14 14 14 14 14 14 14 14 14 1 1 1 14 14 14 14 14 14 14 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 // SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record * is generated (defer as much work as possible to record * generation time): * i) context is allocated, * ii) names from getname are stored without a copy, and * iii) inode information stored from path_lookup. * 3) Ability to disable syscall auditing at boot time (audit=0). * 4) Usable by other parts of the kernel (if audit_log* is called, * then a syscall record will be generated automatically for the * current syscall). * 5) Netlink interface to user-space. * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * * Audit userspace, documentation, tests, and bug/issue trackers: * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/pid.h> #include <linux/audit.h> #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> #include "audit.h" /* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; /** * struct audit_net - audit private network namespace data * @sk: communication socket */ struct audit_net { struct sock *sk; }; /** * struct auditd_connection - kernel/auditd connection state * @pid: auditd PID * @portid: netlink portid * @net: the associated network namespace * @rcu: RCU head * * Description: * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; }; static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ static u32 audit_rate_limit; /* Number of outstanding audit_buffers allowed. * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; static u32 audit_sig_sid; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] 2) out of memory in audit_log_move [alloc_skb] 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ static atomic_t audit_lost = ATOMIC_INIT(0); /* Monotonically increasing sum of time the kernel has spent * waiting while the backlog limit is exceeded. */ static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; /* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); /* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .mask = -1, .features = 0, .lock = 0,}; static char *audit_feature_names[2] = { "only_unset_loginuid", "loginuid_immutable", }; /** * struct audit_ctl_mutex - serialize requests from userspace * @lock: the mutex used for locking * @owner: the task which owns the lock * * Description: * This is the lock struct used to ensure we only process userspace requests * in an orderly fashion. We can't simply use a mutex/lock here because we * need to track lock ownership so we don't end up blocking the lock owner in * audit_log_start() or similar. */ static struct audit_ctl_mutex { struct mutex lock; void *owner; } audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer * should be at least that large. */ #define AUDIT_BUFSIZ 1024 /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ gfp_t gfp_mask; }; struct audit_reply { __u32 portid; struct net *net; struct sk_buff *skb; }; /** * auditd_test_task - Check to see if a given task is an audit daemon * @task: the task to check * * Description: * Return 1 if the task is a registered audit daemon, 0 otherwise. */ int auditd_test_task(struct task_struct *task) { int rc; struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); rcu_read_unlock(); return rc; } /** * audit_ctl_lock - Take the audit control lock */ void audit_ctl_lock(void) { mutex_lock(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = current; } /** * audit_ctl_unlock - Drop the audit control lock */ void audit_ctl_unlock(void) { audit_cmd_mutex.owner = NULL; mutex_unlock(&audit_cmd_mutex.lock); } /** * audit_ctl_owner_current - Test to see if the current task owns the lock * * Description: * Return true if the current task owns the audit control lock, false if it * doesn't own the lock. */ static bool audit_ctl_owner_current(void) { return (current == audit_cmd_mutex.owner); } /** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: * Returns the PID in relation to the namespace, 0 on failure. */ static pid_t auditd_pid_vnr(void) { pid_t pid; const struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac || !ac->pid) pid = 0; else pid = pid_vnr(ac->pid); rcu_read_unlock(); return pid; } /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * * Description: * Returns the sock pointer if valid, NULL otherwise. The caller must ensure * that a reference is held for the network namespace while the sock is in use. */ static struct sock *audit_get_sk(const struct net *net) { struct audit_net *aunet; if (!net) return NULL; aunet = net_generic(net, audit_net_id); return aunet->sk; } void audit_panic(const char *message) { switch (audit_failure) { case AUDIT_FAIL_SILENT: break; case AUDIT_FAIL_PRINTK: if (printk_ratelimit()) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: panic("audit: %s\n", message); break; } } static inline int audit_rate_check(void) { static unsigned long last_check = 0; static int messages = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int retval = 0; if (!audit_rate_limit) return 1; spin_lock_irqsave(&lock, flags); if (++messages < audit_rate_limit) { retval = 1; } else { now = jiffies; if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; } } spin_unlock_irqrestore(&lock, flags); return retval; } /** * audit_log_lost - conditionally log lost audit message event * @message: the message stating reason for lost audit message * * Emit at least 1 message per second, even if audit_rate_check is * throttling. * Always increment the lost messages counter. */ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int print; atomic_inc(&audit_lost); print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } spin_unlock_irqrestore(&lock, flags); } if (print) { if (printk_ratelimit()) pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", atomic_read(&audit_lost), audit_rate_limit, audit_backlog_limit); audit_panic(message); } } static int audit_log_config_change(char *function_name, u32 new, u32 old, int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) { int allow_changes, rc = 0; u32 old = *to_change; /* check if we are locked */ if (audit_enabled == AUDIT_LOCKED) allow_changes = 0; else allow_changes = 1; if (audit_enabled != AUDIT_OFF) { rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } /* If we are allowed, make the change */ if (allow_changes == 1) *to_change = new; /* Not allowed, update reason */ else if (rc == 0) rc = -EPERM; return rc; } static int audit_set_rate_limit(u32 limit) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } static int audit_set_backlog_limit(u32 limit) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) { int rc; if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } static int audit_set_failure(u32 state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state); } /** * auditd_conn_free - RCU helper to release an auditd connection struct * @rcu: RCU head * * Description: * Drop any references inside the auditd connection tracking struct and free * the memory. */ static void auditd_conn_free(struct rcu_head *rcu) { struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); } /** * auditd_set - Set/Reset the auditd connection state * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer * @skb: the netlink command from the audit daemon * @ack: netlink ack flag, cleared if ack'd here * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ static int auditd_set(struct pid *pid, u32 portid, struct net *net, struct sk_buff *skb, bool *ack) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; struct nlmsghdr *nlh; if (!pid || !net) return -EINVAL; ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); if (!ac_new) return -ENOMEM; ac_new->pid = get_pid(pid); ac_new->portid = portid; ac_new->net = get_net(net); /* send the ack now to avoid a race with the queue backlog */ if (*ack) { nlh = nlmsg_hdr(skb); netlink_ack(skb, nlh, 0, NULL); *ack = false; } spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); rcu_assign_pointer(auditd_conn, ac_new); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); return 0; } /** * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); } /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { /* put the record back in the queue */ skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this * function is called we haven't given up yet on sending the record, but things * are not looking good. The first thing we want to do is try to write the * record via printk and then see if we want to try and hold on to the record * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ if (!audit_default) goto drop; /* the hold queue is only for when the daemon goes away completely, * not -EAGAIN failures; if we are in a -EAGAIN state requeue the * record on the retry queue unless it's full, in which case drop it */ if (error == -EAGAIN) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } audit_log_lost("kauditd retry queue overflow"); goto drop; } /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); return; } /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } /* we have to drop the record, send it via printk as a last effort */ kauditd_printk_skb(skb); audit_log_lost("kauditd retry queue overflow"); kfree_skb(skb); } /** * auditd_reset - Disconnect the auditd connection * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the * hold queue in case auditd reconnects. It is important to note that the @ac * pointer should never be dereferenced inside this function as it may be NULL * or invalid, you can only compare the memory address! If @ac is NULL then * the connection will always be reset. */ static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; struct auditd_connection *ac_old; /* if it isn't already broken, break the connection */ spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); if (ac && ac != ac_old) { /* someone already registered a new auditd connection */ spin_unlock_irqrestore(&auditd_conn_lock, flags); return; } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb, -ECONNREFUSED); } /** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record * * Description: * Send a skb to the audit daemon, returns positive/zero values on success and * negative values on failure; in all cases the skb will be consumed by this * function. If the send results in -ECONNREFUSED the connection with auditd * will be reset. This function may sleep so callers should not hold any locks * where this would cause a problem. */ static int auditd_send_unicast_skb(struct sk_buff *skb) { int rc; u32 portid; struct net *net; struct sock *sk; struct auditd_connection *ac; /* NOTE: we can't call netlink_unicast while in the RCU section so * take a reference to the network namespace and grab local * copies of the namespace, the sock, and the portid; the * namespace and sock aren't going to go away while we hold a * reference and if the portid does become invalid after the RCU * section netlink_unicast() should safely return an error */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); kfree_skb(skb); rc = -ECONNREFUSED; goto err; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); rc = netlink_unicast(sk, skb, portid, 0); put_net(net); if (rc < 0) goto err; return rc; err: if (ac && rc == -ECONNREFUSED) auditd_reset(ac); return rc; } /** * kauditd_send_queue - Helper for kauditd_thread to flush skb queues * @sk: the sending sock * @portid: the netlink destination * @queue: the skb queue to process * @retry_limit: limit on number of netlink unicast failures * @skb_hook: per-skb hook for additional processing * @err_hook: hook called if the skb fails the netlink unicast send * * Description: * Run through the given queue and attempt to send the audit records to auditd, * returns zero on success, negative values on failure. It is up to the caller * to ensure that the @sk is valid for the duration of this function. * */ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; struct sk_buff *skb = NULL; struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ skb_tail = skb_peek_tail(queue); while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) (*err_hook)(skb, -ECONNREFUSED); continue; } retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ continue; } else goto retry; } else { /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } return (rc >= 0 ? 0 : rc); } /* * kauditd_send_multicast_skb - Send a record to any multicast listeners * @skb: audit record * * Description: * Write a multicast message to anyone listening in the initial network * namespace. This function doesn't consume an skb as might be expected since * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; /* NOTE: we are not taking an additional reference for init_net since * we don't have to worry about it going away */ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to * the skb by the original kaudit unicast socket send routine. The * existing auditd daemon assumes this breakage. Fixing this would * require co-ordinating a change in the established protocol between * the kaudit kernel subsystem and the auditd userspace code. There is * no reason for new multicast clients to continue with this * non-compliance. */ copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; nlh = nlmsg_hdr(copy); nlh->nlmsg_len = skb->len; nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } /** * kauditd_thread - Worker thread to send audit records to userspace * @dummy: unused */ static int kauditd_thread(void *dummy) { int rc; u32 portid = 0; struct net *net = NULL; struct sock *sk = NULL; struct auditd_connection *ac; #define UNICAST_RETRIES 5 set_freezable(); while (!kthread_should_stop()) { /* NOTE: see the lock comments in auditd_send_unicast_skb() */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); goto main_queue; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); /* attempt to flush the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } /* attempt to flush the retry queue */ rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, (sk ? kauditd_retry_skb : kauditd_hold_skb)); if (ac && rc < 0) auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); /* NOTE: we want to wake up if there is anything on the queue, * regardless of if an auditd is connected, as we need to * do the multicast send and rotate records from the * main queue to the retry/hold queues */ wait_event_freezable(kauditd_wait, (skb_queue_len(&audit_queue) ? 1 : 0)); } return 0; } int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ audit_ctl_lock(); audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); put_net(dest->net); kfree(dest); return 0; } struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; out_kfree_skb: kfree_skb(skb); return NULL; } static void audit_free_reply(struct audit_reply *reply) { if (!reply) return; kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); } static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); reply->skb = NULL; audit_free_reply(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag * @multi: multi-part message flag * @payload: payload data * @size: payload size * * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { struct task_struct *tsk; struct audit_reply *reply; reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; reply->skb = audit_make_reply(seq, type, done, multi, payload, size); if (!reply->skb) goto err; reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); if (IS_ERR(tsk)) goto err; return; err: audit_free_reply(reply); } /* * Check for appropriate CAP_AUDIT_ capabilities on incoming audit * control messages. */ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; /* Only support initial user namespace for now. */ /* * We return ECONNREFUSED because it tricks userspace into thinking * that audit was not configured into the kernel. Lots of users * configure their PAM stack (because that's what the distro does) * to reject login if unable to send messages to audit. If we return * ECONNREFUSED the PAM stack thinks the kernel does not have audit * configured in and will let login proceed. If we return EPERM * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: case AUDIT_ADD: case AUDIT_DEL: return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: case AUDIT_GET_FEATURE: case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. */ if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ err = -EINVAL; } return err; } static void audit_log_common_recv_msg(struct audit_context *context, struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return; } *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } static inline void audit_log_user_recv_msg(struct audit_buffer **ab, u16 msg_type) { audit_log_common_recv_msg(NULL, ab, msg_type); } static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } static int audit_get_feature(struct sk_buff *skb) { u32 seq; seq = nlmsg_hdr(skb)->nlmsg_seq; audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, u32 old_lock, u32 new_lock, int res) { struct audit_buffer *ab; if (audit_enabled == AUDIT_OFF) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); } static int audit_set_feature(struct audit_features *uaf) { int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); /* if there is ever a version 2 we should handle that here */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; new_lock = (uaf->lock | af.lock) & feature; old_lock = af.lock & feature; /* are we changing a locked feature? */ if (old_lock && (new_feature != old_feature)) { audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 0); return -EPERM; } } /* nothing invalid, do the changes */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; old_lock = af.lock & feature; new_lock = (uaf->lock | af.lock) & feature; if (new_feature != old_feature) audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 1); if (new_feature) af.features |= feature; else af.features &= ~feature; af.lock |= new_lock; } return 0; } static int audit_replace(struct pid *pid) { pid_t pvnr; struct sk_buff *skb; pvnr = pid_vnr(pid); skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, bool *ack) { u32 seq; void *data; int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; err = audit_netlink_ok(skb, msg_type); if (err) return err; seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(s.failure); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_PID) { /* NOTE: we are using the vnr PID functions below * because the s.pid value is relative to the * namespace of the caller; at present this * doesn't matter much since you can really only * run auditd from the initial pid namespace, but * something to keep in mind if this changes */ pid_t new_pid = s.pid; pid_t auditd_pid; struct pid *req_pid = task_tgid(current); /* Sanity check - PID values must match. Setting * pid to 0 is how auditd ends auditing. */ if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); if (auditd_pid) { /* replacing a healthy auditd is not allowed */ if (new_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EEXIST; } /* only current auditd can unregister itself */ if (pid_vnr(req_pid) != auditd_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EACCES; } } if (new_pid) { /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, sock_net(NETLINK_CB(skb).sk), skb, ack); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, err ? 0 : 1); if (err) return err; /* try to process any backlog */ wake_up_interruptible(&kauditd_wait); } else { if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, 1); /* unregister the auditd connection */ auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { err = audit_set_backlog_limit(s.backlog_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) return err; } if (s.mask == AUDIT_STATUS_LOST) { u32 lost = atomic_xchg(&audit_lost, 0); audit_log_config_change("lost", 0, lost, 1); return lost; } if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); return actual; } break; } case AUDIT_GET_FEATURE: err = audit_get_feature(skb); if (err) return err; break; case AUDIT_SET_FEATURE: if (data_len < sizeof(struct audit_features)) return -EINVAL; err = audit_set_feature(data); if (err) return err; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; /* exit early if there isn't at least one character to print */ if (data_len < 2) return -EINVAL; err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ char *str = data; err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); if (err) break; } audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) { /* ensure NULL termination */ str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, str); } else { audit_log_format(ab, " data="); if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=%s audit_enabled=%d res=0", msg_type == AUDIT_ADD_RULE ? "add_rule" : "remove_rule", audit_enabled); audit_log_end(ab); return -EPERM; } err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; size_t msglen = data_len; char *old, *new; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); msglen -= 2 * sizeof(u32); old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); break; } /* OK, here comes... */ err = audit_tag_tree(old, new); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); audit_log_untrustedstring(ab, new); audit_log_format(ab, " res=%d", !err); audit_log_end(ab); kfree(old); kfree(new); break; } case AUDIT_SIGNAL_INFO: len = 0; if (audit_sig_sid) { err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); if (err) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { if (audit_sig_sid) security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (audit_sig_sid) { memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, struct_size(sig_data, ctx, len)); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; unsigned int t; t = READ_ONCE(current->signal->audit_tty); s.enabled = t & AUDIT_TTY_ENABLE; s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; struct audit_buffer *ab; unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; if (err) t = READ_ONCE(current->signal->audit_tty); else { t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); t = xchg(&current->signal->audit_tty, t); } old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, s.log_passwd, !err); audit_log_end(ab); break; } default: err = -EINVAL; break; } return err < 0 ? err : 0; } /** * audit_receive - receive messages from a netlink control socket * @skb: the message buffer * * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; bool ack; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; int err; nlh = nlmsg_hdr(skb); len = skb->len; audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { ack = nlh->nlmsg_flags & NLM_F_ACK; err = audit_receive_msg(skb, nlh, &ack); /* send an ack if the user asked for one and audit_receive_msg * didn't already do it, or if there was an error. */ if (ack || err) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); /* can't block with the ctrl lock, so penalize the sender now */ if (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { DECLARE_WAITQUEUE(wait, current); /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(audit_backlog_wait_time); remove_wait_queue(&audit_backlog_wait, &wait); } } /* Log information about who is connecting to the audit multicast socket */ static void audit_log_multicast(int group, const char *op, int err) { const struct cred *cred; struct tty_struct *tty; char comm[sizeof(current->comm)]; struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); /* exe= */ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); audit_log_end(ab); } /* Run custom bind function on netlink socket group connect or bind requests. */ static int audit_multicast_bind(struct net *net, int group) { int err = 0; if (!capable(CAP_AUDIT_READ)) err = -EPERM; audit_log_multicast(group, "connect", err); return err; } static void audit_multicast_unbind(struct net *net, int group) { audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_multicast_bind, .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } /* limit the timeout in case auditd is blocked/stopped */ aunet->sk->sk_sndtimeo = HZ / 10; return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); /* NOTE: you would think that we would want to check the auditd * connection and potentially reset it here if it lives in this * namespace, but since the auditd connection tracking struct holds a * reference to this namespace (see auditd_set()) we are only ever * going to get here after that connection has been released */ netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { .init = audit_net_init, .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), }; /* Initialize audit support at boot time. */ static int __init audit_init(void) { int i; if (audit_initialized == AUDIT_DISABLED) return 0; audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); mutex_init(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); panic("audit: failed to start the kauditd thread (%d)\n", err); } audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "state=initialized audit_enabled=%u res=1", audit_enabled); return 0; } postcore_initcall(audit_init); /* * Process kernel command-line parameter at boot time. * audit={0|off} or audit={1|on}. */ static int __init audit_enable(char *str) { if (!strcasecmp(str, "off") || !strcmp(str, "0")) audit_default = AUDIT_OFF; else if (!strcasecmp(str, "on") || !strcmp(str, "1")) audit_default = AUDIT_ON; else { pr_err("audit: invalid 'audit' parameter value (%s)\n", str); audit_default = AUDIT_ON; } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); return 1; } __setup("audit=", audit_enable); /* Process kernel command-line parameter at boot time. * audit_backlog_limit=<n> */ static int __init audit_backlog_limit_set(char *str) { u32 audit_backlog_limit_arg; pr_info("audit_backlog_limit: "); if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { pr_cont("using default of %u, unable to parse %s\n", audit_backlog_limit, str); return 1; } audit_backlog_limit = audit_backlog_limit_arg; pr_cont("%d\n", audit_backlog_limit); return 1; } __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { if (!ab) return; kfree_skb(ab->skb); kmem_cache_free(audit_buffer_cache, ab); } static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); if (!ab) return NULL; ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; ab->ctx = ctx; ab->gfp_mask = gfp_mask; return ab; err: audit_buffer_free(ab); return NULL; } /** * audit_serial - compute a serial number for the audit record * * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to * determine which pieces belong to the same audit record. The * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); return atomic_inc_return(&serial); } static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) * @gfp_mask: type of allocation * @type: audit message type * * Returns audit_buffer pointer on success or NULL on error. * * Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the task (ctx) is a task that is currently in a * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, then * task context (ctx) should be NULL. */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; struct timespec64 t; unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex, although we do penalize the sender * later in audit_receive() when it is safe to block */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { long rtime = stime; DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); stime = schedule_timeout(rtime); atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } } } ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } audit_get_stamp(ab->ctx, &t, &serial); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. */ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int oldtail = skb_tailroom(skb); int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); int newtail = skb_tailroom(skb); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; } skb->truesize += newtail - oldtail; return newtail; } /* * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; va_list args2; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); if (avail == 0) { avail = audit_expand(ab, AUDIT_BUFSIZ); if (!avail) goto out; } va_copy(args2, args); len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } if (len > 0) skb_put(skb, len); out_va_end: va_end(args2); out: return; } /** * audit_log_format - format a message into the audit buffer. * @ab: audit_buffer * @fmt: format string * @...: optional parameters matching @fmt string * * All the work is done in audit_log_vformat. */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; if (!ab) return; va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); } /** * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted * * No return value; failure to expand is silently ignored. * * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = len<<1; if (new_len >= avail) { /* Round the buffer request up to the next multiple */ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); for (i = 0; i < len; i++) ptr = hex_byte_pack_upper(ptr, buf[i]); *ptr = 0; skb_put(skb, len << 1); /* new string is twice the old string */ } /* * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ void audit_log_n_string(struct audit_buffer *ab, const char *string, size_t slen) { int avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = slen + 3; /* enclosing quotes + null terminator */ if (new_len > avail) { avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); *ptr++ = '"'; memcpy(ptr, string, slen); ptr += slen; *ptr++ = '"'; *ptr = 0; skb_put(skb, slen + 2); /* don't include null terminator */ } /** * audit_string_contains_control - does a string need to be logged in hex * @string: string to be checked * @len: max length of the string to check */ bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) return true; } return false; } /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). * * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t len) { if (audit_string_contains_control(string, len)) audit_log_n_hex(ab, string, len); else audit_log_n_string(ab, string, len); } /** * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { char *p, *pathname; if (prefix) audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); } void audit_log_session_info(struct audit_buffer *ab) { unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); if (key) audit_log_untrustedstring(ab, key); else audit_log_format(ab, "(null)"); } int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; unsigned len; int error; u32 sid; security_current_getsecid_subj(&sid); if (!sid) return 0; error = security_secid_to_secctx(sid, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; return 0; } audit_log_format(ab, " subj=%s", ctx); security_release_secctx(ctx, len); return 0; error_path: audit_panic("error in audit_log_task_context"); return error; } EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { struct file *exe_file; if (!mm) goto out_null; exe_file = get_mm_exe_file(mm); if (!exe_file) goto out_null; audit_log_d_path(ab, " exe=", &exe_file->f_path); fput(exe_file); return; out_null: audit_log_format(ab, " exe=(null)"); } struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; spin_lock_irqsave(&current->sighand->siglock, flags); if (current->signal) tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(&current->sighand->siglock, flags); return tty; } void audit_put_tty(struct tty_struct *tty) { tty_kref_put(tty); } void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_path_denied - report a path restriction denial * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) * @operation: specific operation name */ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled || audit_dummy_context()) return; /* Generate log with subject, operation, outcome. */ ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); static int audit_set_loginuid_perm(kuid_t loginuid) { /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) return -EPERM; /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; /* reject if this is not an unset and we don't allow that */ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) return -EPERM; return 0; } static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, unsigned int oldsessionid, unsigned int sessionid, int rc) { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; struct tty_struct *tty; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid); tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", oldsessionid, sessionid, !rc); audit_put_tty(tty); audit_log_end(ab); } /** * audit_set_loginuid - set current task's loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ int audit_set_loginuid(kuid_t loginuid) { unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; oldloginuid = audit_get_loginuid(current); oldsessionid = audit_get_sessionid(current); rc = audit_set_loginuid_perm(loginuid); if (rc) goto out; /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } current->sessionid = sessionid; current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; } /** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info(int sig, struct task_struct *t) { kuid_t uid = current_uid(), auid; if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { audit_sig_pid = task_tgid_nr(current); auid = audit_get_loginuid(current); if (uid_valid(auid)) audit_sig_uid = auid; else audit_sig_uid = uid; security_current_getsecid_subj(&audit_sig_sid); } return audit_signal_info_syscall(t); } /** * audit_log_end - end one audit record * @ab: the audit_buffer * * We can not do a netlink send inside an irq context because it blocks (last * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a * queue and a kthread is scheduled to remove them from the queue outside the * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; struct nlmsghdr *nlh; if (!ab) return; if (audit_rate_check()) { skb = ab->skb; ab->skb = NULL; /* setup the netlink header, see the comments in * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; /* queue the netlink packet and poke the kauditd thread */ skb_queue_tail(&audit_queue, skb); wake_up_interruptible(&kauditd_wait); } else audit_log_lost("rate limit exceeded"); audit_buffer_free(ab); } /** * audit_log - Log an audit record * @ctx: audit context * @gfp_mask: type of allocation * @type: audit message type * @fmt: format string to use * @...: variable parameters matching the format string * * This is a convenience function that calls audit_log_start, * audit_log_vformat, and audit_log_end. It may be called * in any context. */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); audit_log_end(ab); } } EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); EXPORT_SYMBOL(audit_log);
10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* * net/tipc/addr.c: TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "addr.h" #include "core.h" bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) return true; if (!legacy_format) return false; if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ return true; return false; } void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); tn->trial_addr = hash128to32(id); pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } void tipc_set_node_addr(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); u8 node_id[NODE_ID_LEN] = {0,}; tn->node_addr = addr; if (!tipc_own_id(net)) { sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } tn->trial_addr = addr; tn->addr_trial_end = jiffies; pr_info("Node number set to %u\n", addr); } char *tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; /* Already a string ? */ for (i = 0; i < NODE_ID_LEN; i++) { c = id[i]; if (c >= '0' && c <= '9') continue; if (c >= 'A' && c <= 'Z') continue; if (c >= 'a' && c <= 'z') continue; if (c == '.') continue; if (c == ':') continue; if (c == '_') continue; if (c == '-') continue; if (c == '@') continue; if (c != 0) break; } if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; return str; } /* Translate to hex string */ for (i = 0; i < NODE_ID_LEN; i++) sprintf(&str[2 * i], "%02x", id[i]); /* Strip off trailing zeroes */ for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; return str; }
4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 /* net/tipc/udp_media.c: IP bearer support for TIPC * * Copyright (c) 2015, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/list.h> #include <net/sock.h> #include <net/ip.h> #include <net/udp_tunnel.h> #include <net/ipv6_stubs.h> #include <linux/tipc_netlink.h> #include "core.h" #include "addr.h" #include "net.h" #include "bearer.h" #include "netlink.h" #include "msg.h" #include "udp_media.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 #define UDP_MIN_HEADROOM 48 /** * struct udp_media_addr - IP/UDP addressing information * * This is the bearer level originating address used in neighbor discovery * messages, and all fields should be in network byte order * * @proto: Ethernet protocol in use * @port: port being used * @ipv4: IPv4 address of neighbor * @ipv6: IPv6 address of neighbor */ struct udp_media_addr { __be16 proto; __be16 port; union { struct in_addr ipv4; struct in6_addr ipv6; }; }; /* struct udp_replicast - container for UDP remote addresses */ struct udp_replicast { struct udp_media_addr addr; struct dst_cache dst_cache; struct rcu_head rcu; struct list_head list; }; /** * struct udp_bearer - ip/udp bearer data structure * @bearer: associated generic tipc bearer * @ubsock: bearer associated socket * @ifindex: local address scope * @work: used to schedule deferred work on a bearer * @rcast: associated udp_replicast container */ struct udp_bearer { struct tipc_bearer __rcu *bearer; struct socket *ubsock; u32 ifindex; struct work_struct work; struct udp_replicast rcast; }; static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr) { if (ntohs(addr->proto) == ETH_P_IP) return ipv4_is_multicast(addr->ipv4.s_addr); #if IS_ENABLED(CONFIG_IPV6) else return ipv6_addr_is_multicast(&addr->ipv6); #endif return 0; } /* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, struct udp_media_addr *ua) { memset(addr, 0, sizeof(struct tipc_media_addr)); addr->media_id = TIPC_MEDIA_TYPE_UDP; memcpy(addr->value, ua, sizeof(struct udp_media_addr)); if (tipc_udp_is_mcast_addr(ua)) addr->broadcast = TIPC_BROADCAST_SUPPORT; } /* tipc_udp_addr2str - convert ip/udp address to string */ static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) { struct udp_media_addr *ua = (struct udp_media_addr *)&a->value; if (ntohs(ua->proto) == ETH_P_IP) snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port)); else if (ntohs(ua->proto) == ETH_P_IPV6) snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port)); else pr_err("Invalid UDP media address\n"); return 0; } /* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message */ static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a, char *msg) { struct udp_media_addr *ua; ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET); if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP) return -EINVAL; tipc_udp_media_addr_set(a, ua); return 0; } /* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message */ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP; memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value, sizeof(struct udp_media_addr)); return 0; } /* tipc_send_msg - enqueue a send request */ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_bearer *ub, struct udp_media_addr *src, struct udp_media_addr *dst, struct dst_cache *cache) { struct dst_entry *ndst; int ttl, err = 0; local_bh_disable(); ndst = dst_cache_get(cache); if (dst->proto == htons(ETH_P_IP)) { struct rtable *rt = (struct rtable *)ndst; if (!rt) { struct flowi4 fl = { .daddr = dst->ipv4.s_addr, .saddr = src->ipv4.s_addr, .flowi4_mark = skb->mark, .flowi4_proto = IPPROTO_UDP }; rt = ip_route_output_key(net, &fl); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto tx_error; } dst_cache_set_ip4(cache, &rt->dst, fl.saddr); } ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, dst->port, false, true); #if IS_ENABLED(CONFIG_IPV6) } else { if (!ndst) { struct flowi6 fl6 = { .flowi6_oif = ub->ifindex, .daddr = dst->ipv6, .saddr = src->ipv6, .flowi6_proto = IPPROTO_UDP }; ndst = ipv6_stub->ipv6_dst_lookup_flow(net, ub->ubsock->sk, &fl6, NULL); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); goto tx_error; } dst_cache_set_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, src->port, dst->port, false); #endif } local_bh_enable(); return err; tx_error: local_bh_enable(); kfree_skb(skb); return err; } static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *addr) { struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value; struct udp_replicast *rcast; struct udp_bearer *ub; int err = 0; if (skb_headroom(skb) < UDP_MIN_HEADROOM) { err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); if (err) goto out; } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); ub = rcu_dereference(b->media_ptr); if (!ub) { err = -ENODEV; goto out; } if (addr->broadcast != TIPC_REPLICAST_SUPPORT) return tipc_udp_xmit(net, skb, ub, src, dst, &ub->rcast.dst_cache); /* Replicast, send an skb to each configured IP address */ list_for_each_entry_rcu(rcast, &ub->rcast.list, list) { struct sk_buff *_skb; _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) { err = -ENOMEM; goto out; } err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr, &rcast->dst_cache); if (err) goto out; } err = 0; out: kfree_skb(skb); return err; } static bool tipc_udp_is_known_peer(struct tipc_bearer *b, struct udp_media_addr *addr) { struct udp_replicast *rcast, *tmp; struct udp_bearer *ub; ub = rcu_dereference_rtnl(b->media_ptr); if (!ub) { pr_err_ratelimited("UDP bearer instance not found\n"); return false; } list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr))) return true; } return false; } static int tipc_udp_rcast_add(struct tipc_bearer *b, struct udp_media_addr *addr) { struct udp_replicast *rcast; struct udp_bearer *ub; ub = rcu_dereference_rtnl(b->media_ptr); if (!ub) return -ENODEV; rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC); if (!rcast) return -ENOMEM; if (dst_cache_init(&rcast->dst_cache, GFP_ATOMIC)) { kfree(rcast); return -ENOMEM; } memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr)); if (ntohs(addr->proto) == ETH_P_IP) pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(addr->proto) == ETH_P_IPV6) pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); #endif b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT; list_add_rcu(&rcast->list, &ub->rcast.list); return 0; } static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb) { struct udp_media_addr src = {0}; struct udp_media_addr *dst; dst = (struct udp_media_addr *)&b->bcast_addr.value; if (tipc_udp_is_mcast_addr(dst)) return 0; src.port = udp_hdr(skb)->source; if (ip_hdr(skb)->version == 4) { struct iphdr *iphdr = ip_hdr(skb); src.proto = htons(ETH_P_IP); src.ipv4.s_addr = iphdr->saddr; if (ipv4_is_multicast(iphdr->daddr)) return 0; #if IS_ENABLED(CONFIG_IPV6) } else if (ip_hdr(skb)->version == 6) { struct ipv6hdr *iphdr = ipv6_hdr(skb); src.proto = htons(ETH_P_IPV6); src.ipv6 = iphdr->saddr; if (ipv6_addr_is_multicast(&iphdr->daddr)) return 0; #endif } else { return 0; } if (likely(tipc_udp_is_known_peer(b, &src))) return 0; return tipc_udp_rcast_add(b, &src); } /* tipc_udp_recv - read data from bearer socket */ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) { struct udp_bearer *ub; struct tipc_bearer *b; struct tipc_msg *hdr; int err; ub = rcu_dereference_sk_user_data(sk); if (!ub) { pr_err_ratelimited("Failed to get UDP bearer reference"); goto out; } skb_pull(skb, sizeof(struct udphdr)); hdr = buf_msg(skb); b = rcu_dereference(ub->bearer); if (!b) goto out; if (b && test_bit(0, &b->up)) { TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(sock_net(sk), skb, b); return 0; } if (unlikely(msg_user(hdr) == LINK_CONFIG)) { err = tipc_udp_rcast_disc(b, skb); if (err) goto out; } out: kfree_skb(skb); return 0; } static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) { int err = 0; struct ip_mreqn mreqn; struct sock *sk = ub->ubsock->sk; if (ntohs(remote->proto) == ETH_P_IP) { mreqn.imr_multiaddr = remote->ipv4; mreqn.imr_ifindex = ub->ifindex; err = ip_mc_join_group(sk, &mreqn); #if IS_ENABLED(CONFIG_IPV6) } else { lock_sock(sk); err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); release_sock(sk); #endif } return err; } static int __tipc_nl_add_udp_addr(struct sk_buff *skb, struct udp_media_addr *addr, int nla_t) { if (ntohs(addr->proto) == ETH_P_IP) { struct sockaddr_in ip4; memset(&ip4, 0, sizeof(ip4)); ip4.sin_family = AF_INET; ip4.sin_port = addr->port; ip4.sin_addr.s_addr = addr->ipv4.s_addr; if (nla_put(skb, nla_t, sizeof(ip4), &ip4)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) } else if (ntohs(addr->proto) == ETH_P_IPV6) { struct sockaddr_in6 ip6; memset(&ip6, 0, sizeof(ip6)); ip6.sin6_family = AF_INET6; ip6.sin6_port = addr->port; memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr)); if (nla_put(skb, nla_t, sizeof(ip6), &ip6)) return -EMSGSIZE; #endif } return 0; } int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) { u32 bid = cb->args[0]; u32 skip_cnt = cb->args[1]; u32 portid = NETLINK_CB(cb->skb).portid; struct udp_replicast *rcast, *tmp; struct tipc_bearer *b; struct udp_bearer *ub; void *hdr; int err; int i; if (!bid && !skip_cnt) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; char *bname; if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(battrs, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER], tipc_nl_bearer_policy, NULL); if (err) return err; if (!battrs[TIPC_NLA_BEARER_NAME]) return -EINVAL; bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]); rtnl_lock(); b = tipc_bearer_find(net, bname); if (!b) { rtnl_unlock(); return -EINVAL; } bid = b->identity; } else { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); rtnl_lock(); b = rtnl_dereference(tn->bearer_list[bid]); if (!b) { rtnl_unlock(); return -EINVAL; } } ub = rtnl_dereference(b->media_ptr); if (!ub) { rtnl_unlock(); return -EINVAL; } i = 0; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { if (i < skip_cnt) goto count; hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_BEARER_GET); if (!hdr) goto done; err = __tipc_nl_add_udp_addr(skb, &rcast->addr, TIPC_NLA_UDP_REMOTE); if (err) { genlmsg_cancel(skb, hdr); goto done; } genlmsg_end(skb, hdr); count: i++; } done: rtnl_unlock(); cb->args[0] = bid; cb->args[1] = i; return skb->len; } int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) { struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct udp_media_addr *dst; struct udp_bearer *ub; struct nlattr *nest; ub = rtnl_dereference(b->media_ptr); if (!ub) return -ENODEV; nest = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER_UDP_OPTS); if (!nest) goto msg_full; if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL)) goto msg_full; dst = (struct udp_media_addr *)&b->bcast_addr.value; if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE)) goto msg_full; if (!list_empty(&ub->rcast.list)) { if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP)) goto msg_full; } nla_nest_end(msg->skb, nest); return 0; msg_full: nla_nest_cancel(msg->skb, nest); return -EMSGSIZE; } /** * tipc_parse_udp_addr - build udp media address from netlink data * @nla: netlink attribute containing sockaddr storage aligned address * @addr: tipc media address to fill with address, port and protocol type * @scope_id: IPv6 scope id pointer, not NULL indicates it's required */ static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr, u32 *scope_id) { struct sockaddr_storage sa; nla_memcpy(&sa, nla, sizeof(sa)); if (sa.ss_family == AF_INET) { struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa; addr->proto = htons(ETH_P_IP); addr->port = ip4->sin_port; addr->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; #if IS_ENABLED(CONFIG_IPV6) } else if (sa.ss_family == AF_INET6) { struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa; addr->proto = htons(ETH_P_IPV6); addr->port = ip6->sin6_port; memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); /* Scope ID is only interesting for local addresses */ if (scope_id) { int atype; atype = ipv6_addr_type(&ip6->sin6_addr); if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) { return -EINVAL; } *scope_id = ip6->sin6_scope_id ? : 0; } return 0; #endif } return -EADDRNOTAVAIL; } int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr) { int err; struct udp_media_addr addr = {0}; struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; struct udp_media_addr *dst; if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy, NULL)) return -EINVAL; if (!opts[TIPC_NLA_UDP_REMOTE]) return -EINVAL; err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL); if (err) return err; dst = (struct udp_media_addr *)&b->bcast_addr.value; if (tipc_udp_is_mcast_addr(dst)) { pr_err("Can't add remote ip to TIPC UDP multicast bearer\n"); return -EINVAL; } if (tipc_udp_is_known_peer(b, &addr)) return 0; return tipc_udp_rcast_add(b, &addr); } /** * tipc_udp_enable - callback to create a new udp bearer instance * @net: network namespace * @b: pointer to generic tipc_bearer * @attrs: netlink bearer configuration * * validate the bearer parameters and initialize the udp bearer * rtnl_lock should be held */ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]) { int err = -EINVAL; struct udp_bearer *ub; struct udp_media_addr remote = {0}; struct udp_media_addr local = {0}; struct udp_port_cfg udp_conf = {0}; struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; u8 node_id[NODE_ID_LEN] = {0,}; struct net_device *dev; int rmcast = 0; ub = kzalloc(sizeof(*ub), GFP_ATOMIC); if (!ub) return -ENOMEM; INIT_LIST_HEAD(&ub->rcast.list); if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) goto err; if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attrs[TIPC_NLA_BEARER_UDP_OPTS], tipc_nl_udp_policy, NULL)) goto err; if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) { pr_err("Invalid UDP bearer configuration"); err = -EINVAL; goto err; } err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local, &ub->ifindex); if (err) goto err; err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL); if (err) goto err; if (remote.proto != local.proto) { err = -EINVAL; goto err; } /* Checking remote ip address */ rmcast = tipc_udp_is_mcast_addr(&remote); /* Autoconfigure own node identity if needed */ if (!tipc_own_id(net)) { memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16); tipc_net_init(net, node_id, 0); } if (!tipc_own_id(net)) { pr_warn("Failed to set node id, please configure manually\n"); err = -EINVAL; goto err; } b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; rcu_assign_pointer(b->media_ptr, ub); rcu_assign_pointer(ub->bearer, b); tipc_udp_media_addr_set(&b->addr, &local); if (local.proto == htons(ETH_P_IP)) { dev = __ip_dev_find(net, local.ipv4.s_addr, false); if (!dev) { err = -ENODEV; goto err; } udp_conf.family = AF_INET; /* Switch to use ANY to receive packets from group */ if (rmcast) udp_conf.local_ip.s_addr = htonl(INADDR_ANY); else udp_conf.local_ip.s_addr = local.ipv4.s_addr; udp_conf.use_udp_checksums = false; ub->ifindex = dev->ifindex; b->encap_hlen = sizeof(struct iphdr) + sizeof(struct udphdr); b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { dev = ub->ifindex ? __dev_get_by_index(net, ub->ifindex) : NULL; dev = ipv6_dev_find(net, &local.ipv6, dev); if (!dev) { err = -ENODEV; goto err; } udp_conf.family = AF_INET6; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; if (rmcast) udp_conf.local_ip6 = in6addr_any; else udp_conf.local_ip6 = local.ipv6; ub->ifindex = dev->ifindex; b->encap_hlen = sizeof(struct ipv6hdr) + sizeof(struct udphdr); b->mtu = 1280; #endif } else { err = -EAFNOSUPPORT; goto err; } udp_conf.local_udp_port = local.port; err = udp_sock_create(net, &udp_conf, &ub->ubsock); if (err) goto err; tuncfg.sk_user_data = ub; tuncfg.encap_type = 1; tuncfg.encap_rcv = tipc_udp_recv; tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC); if (err) goto free; /* * The bcast media address port is used for all peers and the ip * is used if it's a multicast address. */ memcpy(&b->bcast_addr.value, &remote, sizeof(remote)); if (rmcast) err = enable_mcast(ub, &remote); else err = tipc_udp_rcast_add(b, &remote); if (err) goto free; return 0; free: dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); err: kfree(ub); return err; } /* cleanup_bearer - break the socket/bearer association */ static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); struct udp_replicast *rcast, *tmp; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { dst_cache_destroy(&rcast->dst_cache); list_del_rcu(&rcast->list); kfree_rcu(rcast, rcu); } atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); synchronize_net(); kfree(ub); } /* tipc_udp_disable - detach bearer from socket */ static void tipc_udp_disable(struct tipc_bearer *b) { struct udp_bearer *ub; ub = rtnl_dereference(b->media_ptr); if (!ub) { pr_err("UDP bearer instance not found\n"); return; } sock_set_flag(ub->ubsock->sk, SOCK_DEAD); RCU_INIT_POINTER(ub->bearer, NULL); /* sock_release need to be done outside of rtnl lock */ atomic_inc(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); INIT_WORK(&ub->work, cleanup_bearer); schedule_work(&ub->work); } struct tipc_media udp_media_info = { .send_msg = tipc_udp_send_msg, .enable_media = tipc_udp_enable, .disable_media = tipc_udp_disable, .addr2str = tipc_udp_addr2str, .addr2msg = tipc_udp_addr2msg, .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_DEF_LINK_WIN, .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, .name = "udp" };
226 4236 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 // SPDX-License-Identifier: GPL-2.0 /* * IA-32 Huge TLB Page Support for Kernel. * * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> */ #include <linux/init.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/compat.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/elf.h> /* * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. * Otherwise, returns 0. */ int pmd_huge(pmd_t pmd) { return !pmd_none(pmd) && (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } /* * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. * Otherwise, returns 0. */ int pud_huge(pud_t pud) { #if CONFIG_PGTABLE_LEVELS > 2 return !pud_none(pud) && (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; #else return 0; #endif } #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ info.high_limit = in_32bit_syscall() ? task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (addr & ~PAGE_MASK) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } return addr; } unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; /* No address checking. See comment at mmap_address_hint_valid() */ if (flags & MAP_FIXED) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { addr &= huge_page_mask(h); if (!mmap_address_hint_valid(addr, len)) goto get_unmapped_area; vma = find_vma(mm, addr); if (!vma || addr + len <= vm_start_gap(vma)) return addr; } get_unmapped_area: if (mm->get_unmapped_area == arch_get_unmapped_area) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 bool __init arch_hugetlb_valid_size(unsigned long size) { if (size == PMD_SIZE) return true; else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) return true; else return false; } #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ if (boot_cpu_has(X86_FEATURE_GBPAGES)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } arch_initcall(gigantic_pages_init); #endif #endif
2 108 90 53 86 68 68 44 44 44 108 119 6 5 6 117 1 110 6 116 5 5 116 3 3 1 1 46 46 2 46 46 92 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* First Come First Serve (a.k.a. FIFO) * RFC DRAFT ndata Section 3.1 */ static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, __u16 value, gfp_t gfp) { return 0; } static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = 0; return 0; } static int sctp_sched_fcfs_init(struct sctp_stream *stream) { return 0; } static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { return 0; } static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { } static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_chunk *ch = NULL; struct list_head *entry; if (list_empty(&q->out_chunk_list)) goto out; if (stream->out_curr) { ch = list_entry(stream->out_curr->ext->outq.next, struct sctp_chunk, stream_list); } else { entry = q->out_chunk_list.next; ch = list_entry(entry, struct sctp_chunk, list); } sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, struct sctp_chunk *chunk) { } static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) { } static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } static struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, .sched_all = sctp_sched_fcfs_sched_all, .unsched_all = sctp_sched_fcfs_unsched_all, }; static void sctp_sched_ops_fcfs_init(void) { sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs); } /* API to other parts of the stack */ static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } void sctp_sched_ops_init(void) { sctp_sched_ops_fcfs_init(); sctp_sched_ops_prio_init(); sctp_sched_ops_rr_init(); sctp_sched_ops_fc_init(); sctp_sched_ops_wfq_init(); } static void sctp_sched_free_sched(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { soute = SCTP_SO(stream, i)->ext; if (!soute) continue; sched->free_sid(stream, i); /* Give the next scheduler a clean slate. */ memset_after(soute, 0, outq); } } int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; if (sched > SCTP_SS_MAX) return -EINVAL; n = sctp_sched_ops[sched]; if (old == n) return ret; if (old) sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } /* We have to requeue all chunks already queued. */ list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { if (ch->msg == msg) continue; msg = ch->msg; n->enqueue(&asoc->outqueue, msg); } return ret; err: sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; } int sctp_sched_get_sched(struct sctp_association *asoc) { int i; for (i = 0; i <= SCTP_SS_MAX; i++) if (asoc->outqueue.sched == sctp_sched_ops[i]) return i; return 0; } int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, __u16 value, gfp_t gfp) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); if (ret) return ret; } return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); } int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, __u16 *value) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); } void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; /* datamsg is not finish, so save it as current one, * in case application switch scheduler or a higher * priority stream comes in. */ sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } q->asoc->stream.out_curr = NULL; q->sched->dequeue_done(q, ch); } /* Auxiliary functions for the schedulers */ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) { list_del_init(&ch->list); list_del_init(&ch->stream_list); q->out_qlen -= ch->skb->len; } int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); return asoc->outqueue.sched; }
35 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0 */ /* * INETPEER - A storage for permanent information about peers * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #ifndef _NET_INETPEER_H #define _NET_INETPEER_H #include <linux/types.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/rtnetlink.h> #include <net/ipv6.h> #include <linux/atomic.h> /* IPv4 address key for cache lookups */ struct ipv4_addr_key { __be32 addr; int vif; }; #define INETPEER_MAXKEYSZ (sizeof(struct in6_addr) / sizeof(u32)) struct inetpeer_addr { union { struct ipv4_addr_key a4; struct in6_addr a6; u32 key[INETPEER_MAXKEYSZ]; }; __u16 family; }; struct inet_peer { struct rb_node rb_node; struct inetpeer_addr daddr; u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ u32 n_redirects; unsigned long rate_last; /* * Once inet_peer is queued for deletion (refcnt == 0), following field * is not available: rid * We can share memory with rcu_head to help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ }; struct rcu_head rcu; }; /* following fields might be frequently dirtied */ __u32 dtime; /* the time of last use of not referenced entries */ refcount_t refcnt; }; struct inet_peer_base { struct rb_root rb_root; seqlock_t lock; int total; }; void inet_peer_base_init(struct inet_peer_base *); void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { iaddr->a4.addr = ip; iaddr->a4.vif = 0; iaddr->family = AF_INET; } static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) { return iaddr->a4.addr; } static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, struct in6_addr *in6) { iaddr->a6 = *in6; iaddr->family = AF_INET6; } static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) { return &iaddr->a6; } /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, int vif, int create) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; return inet_getpeer(base, &daddr, create); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, const struct in6_addr *v6daddr, int create) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; return inet_getpeer(base, &daddr, create); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { int i, n; if (a->family == AF_INET) n = sizeof(a->a4) / sizeof(u32); else n = sizeof(a->a6) / sizeof(u32); for (i = 0; i < n; i++) { if (a->key[i] == b->key[i]) continue; if (a->key[i] < b->key[i]) return -1; return 1; } return 0; } /* can be called from BH context or outside */ void inet_putpeer(struct inet_peer *p); bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); void inetpeer_invalidate_tree(struct inet_peer_base *); #endif /* _NET_INETPEER_H */
49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLOCK_BLK_PM_H_ #define _BLOCK_BLK_PM_H_ #include <linux/pm_runtime.h> #ifdef CONFIG_PM static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { if (!q->dev || !blk_queue_pm_only(q)) return 1; /* Nothing to do */ if (pm && q->rpm_status != RPM_SUSPENDED) return 1; /* Request allowed */ pm_request_resume(q->dev); return 0; } static inline void blk_pm_mark_last_busy(struct request *rq) { if (rq->q->dev && !(rq->rq_flags & RQF_PM)) pm_runtime_mark_last_busy(rq->q->dev); } #else static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { return 1; } static inline void blk_pm_mark_last_busy(struct request *rq) { } #endif #endif /* _BLOCK_BLK_PM_H_ */
36 31 5 1 1 5 1 4 2 1 5 23 17 24 5 5 4 18 21 18 18 2 1 21 1 6 17 17 17 10 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2024 Intel Corporation */ #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) #define __MAC80211_DRIVER_TRACE #include <linux/tracepoint.h> #include <net/mac80211.h> #include "ieee80211_i.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM mac80211 #define MAXNAME 32 #define LOCAL_ENTRY __array(char, wiphy_name, 32) #define LOCAL_ASSIGN strscpy(__entry->wiphy_name, wiphy_name(local->hw.wiphy), MAXNAME) #define LOCAL_PR_FMT "%s" #define LOCAL_PR_ARG __entry->wiphy_name #define STA_ENTRY __array(char, sta_addr, ETH_ALEN) #define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : \ eth_zero_addr(__entry->sta_addr)) #define STA_NAMED_ASSIGN(s) memcpy(__entry->sta_addr, (s)->addr, ETH_ALEN) #define STA_PR_FMT " sta:%pM" #define STA_PR_ARG __entry->sta_addr #define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ __field(bool, p2p) \ __string(vif_name, sdata->name) #define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ __entry->p2p = sdata->vif.p2p; \ __assign_str(vif_name, sdata->name) #define VIF_PR_FMT " vif:%s(%d%s)" #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" #define CHANDEF_ENTRY __field(u32, control_freq) \ __field(u32, freq_offset) \ __field(u32, chan_width) \ __field(u32, center_freq1) \ __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHANDEF_ASSIGN(c) \ __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \ __entry->freq_offset = (c) ? ((c)->chan ? (c)->chan->freq_offset : 0) : 0; \ __entry->chan_width = (c) ? (c)->width : 0; \ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ __entry->freq1_offset = (c) ? (c)->freq1_offset : 0; \ __entry->center_freq2 = (c) ? (c)->center_freq2 : 0; #define CHANDEF_PR_FMT " chandef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define CHANDEF_PR_ARG __entry->control_freq, __entry->freq_offset, __entry->chan_width, \ __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2 #define MIN_CHANDEF_ENTRY \ __field(u32, min_control_freq) \ __field(u32, min_freq_offset) \ __field(u32, min_chan_width) \ __field(u32, min_center_freq1) \ __field(u32, min_freq1_offset) \ __field(u32, min_center_freq2) #define MIN_CHANDEF_ASSIGN(c) \ __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0; \ __entry->min_chan_width = (c)->width; \ __entry->min_center_freq1 = (c)->center_freq1; \ __entry->min_freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; #define MIN_CHANDEF_PR_FMT " mindef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ __entry->min_chan_width, \ __entry->min_center_freq1, __entry->min_freq1_offset, \ __entry->min_center_freq2 #define AP_CHANDEF_ENTRY \ __field(u32, ap_control_freq) \ __field(u32, ap_freq_offset) \ __field(u32, ap_chan_width) \ __field(u32, ap_center_freq1) \ __field(u32, ap_freq1_offset) \ __field(u32, ap_center_freq2) #define AP_CHANDEF_ASSIGN(c) \ __entry->ap_control_freq = (c)->chan ? (c)->chan->center_freq : 0;\ __entry->ap_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0;\ __entry->ap_chan_width = (c)->chan ? (c)->width : 0; \ __entry->ap_center_freq1 = (c)->chan ? (c)->center_freq1 : 0; \ __entry->ap_freq1_offset = (c)->chan ? (c)->freq1_offset : 0; \ __entry->ap_center_freq2 = (c)->chan ? (c)->center_freq2 : 0; #define AP_CHANDEF_PR_FMT " ap(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define AP_CHANDEF_PR_ARG __entry->ap_control_freq, __entry->ap_freq_offset, \ __entry->ap_chan_width, \ __entry->ap_center_freq1, __entry->ap_freq1_offset, \ __entry->ap_center_freq2 #define CHANCTX_ENTRY CHANDEF_ENTRY \ MIN_CHANDEF_ENTRY \ AP_CHANDEF_ENTRY \ __field(u8, rx_chains_static) \ __field(u8, rx_chains_dynamic) #define CHANCTX_ASSIGN CHANDEF_ASSIGN(&ctx->conf.def) \ MIN_CHANDEF_ASSIGN(&ctx->conf.min_def) \ AP_CHANDEF_ASSIGN(&ctx->conf.ap) \ __entry->rx_chains_static = ctx->conf.rx_chains_static; \ __entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic #define CHANCTX_PR_FMT CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT AP_CHANDEF_PR_FMT " chains:%d/%d" #define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, AP_CHANDEF_PR_ARG, \ __entry->rx_chains_static, __entry->rx_chains_dynamic #define KEY_ENTRY __field(u32, cipher) \ __field(u8, hw_key_idx) \ __field(u8, flags) \ __field(s8, keyidx) #define KEY_ASSIGN(k) __entry->cipher = (k)->cipher; \ __entry->flags = (k)->flags; \ __entry->keyidx = (k)->keyidx; \ __entry->hw_key_idx = (k)->hw_key_idx; #define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d" #define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx #define AMPDU_ACTION_ENTRY __field(enum ieee80211_ampdu_mlme_action, \ ieee80211_ampdu_mlme_action) \ STA_ENTRY \ __field(u16, tid) \ __field(u16, ssn) \ __field(u16, buf_size) \ __field(bool, amsdu) \ __field(u16, timeout) \ __field(u16, action) #define AMPDU_ACTION_ASSIGN STA_NAMED_ASSIGN(params->sta); \ __entry->tid = params->tid; \ __entry->ssn = params->ssn; \ __entry->buf_size = params->buf_size; \ __entry->amsdu = params->amsdu; \ __entry->timeout = params->timeout; \ __entry->action = params->action; #define AMPDU_ACTION_PR_FMT STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d action %d" #define AMPDU_ACTION_PR_ARG STA_PR_ARG, __entry->tid, __entry->ssn, \ __entry->buf_size, __entry->amsdu, __entry->timeout, \ __entry->action /* * Tracing for driver callbacks. */ DECLARE_EVENT_CLASS(local_only_evt, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local), TP_STRUCT__entry( LOCAL_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; ), TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) ); DECLARE_EVENT_CLASS(local_sdata_addr_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __array(char, addr, ETH_ALEN) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; memcpy(__entry->addr, sdata->vif.addr, ETH_ALEN); ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " addr:%pM", LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr ) ); DECLARE_EVENT_CLASS(local_u32_evt, TP_PROTO(struct ieee80211_local *local, u32 value), TP_ARGS(local, value), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, value) ), TP_fast_assign( LOCAL_ASSIGN; __entry->value = value; ), TP_printk( LOCAL_PR_FMT " value:%d", LOCAL_PR_ARG, __entry->value ) ); DECLARE_EVENT_CLASS(local_sdata_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG ) ); DEFINE_EVENT(local_only_evt, drv_return_void, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(drv_return_int, TP_PROTO(struct ieee80211_local *local, int ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(int, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret) ); TRACE_EVENT(drv_return_bool, TP_PROTO(struct ieee80211_local *local, bool ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %s", LOCAL_PR_ARG, (__entry->ret) ? "true" : "false") ); TRACE_EVENT(drv_return_u32, TP_PROTO(struct ieee80211_local *local, u32 ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %u", LOCAL_PR_ARG, __entry->ret) ); TRACE_EVENT(drv_return_u64, TP_PROTO(struct ieee80211_local *local, u64 ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(u64, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret) ); DEFINE_EVENT(local_only_evt, drv_start, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_u32_evt, drv_get_et_strings, TP_PROTO(struct ieee80211_local *local, u32 sset), TP_ARGS(local, sset) ); DEFINE_EVENT(local_u32_evt, drv_get_et_sset_count, TP_PROTO(struct ieee80211_local *local, u32 sset), TP_ARGS(local, sset) ); DEFINE_EVENT(local_only_evt, drv_get_et_stats, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_only_evt, drv_suspend, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_only_evt, drv_resume, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(drv_set_wakeup, TP_PROTO(struct ieee80211_local *local, bool enabled), TP_ARGS(local, enabled), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, enabled) ), TP_fast_assign( LOCAL_ASSIGN; __entry->enabled = enabled; ), TP_printk(LOCAL_PR_FMT " enabled:%d", LOCAL_PR_ARG, __entry->enabled) ); DEFINE_EVENT(local_only_evt, drv_stop, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_sdata_addr_evt, drv_add_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_change_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p), TP_ARGS(local, sdata, type, p2p), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, new_type) __field(bool, new_p2p) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->new_type = type; __entry->new_p2p = p2p; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " new type:%d%s", LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type, __entry->new_p2p ? "/p2p" : "" ) ); DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_config, TP_PROTO(struct ieee80211_local *local, u32 changed), TP_ARGS(local, changed), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, changed) __field(u32, flags) __field(int, power_level) __field(int, dynamic_ps_timeout) __field(u16, listen_interval) __field(u8, long_frame_max_tx_count) __field(u8, short_frame_max_tx_count) CHANDEF_ENTRY __field(int, smps) ), TP_fast_assign( LOCAL_ASSIGN; __entry->changed = changed; __entry->flags = local->hw.conf.flags; __entry->power_level = local->hw.conf.power_level; __entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout; __entry->listen_interval = local->hw.conf.listen_interval; __entry->long_frame_max_tx_count = local->hw.conf.long_frame_max_tx_count; __entry->short_frame_max_tx_count = local->hw.conf.short_frame_max_tx_count; CHANDEF_ASSIGN(&local->hw.conf.chandef) __entry->smps = local->hw.conf.smps_mode; ), TP_printk( LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT, LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG ) ); TRACE_EVENT(drv_vif_cfg_changed, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed), TP_ARGS(local, sdata, changed), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u64, changed) __field(bool, assoc) __field(bool, ibss_joined) __field(bool, ibss_creator) __field(u16, aid) __dynamic_array(u32, arp_addr_list, sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : sdata->vif.cfg.arp_addr_cnt) __field(int, arp_addr_cnt) __dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len) __field(int, s1g) __field(bool, idle) __field(bool, ps) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->changed = changed; __entry->aid = sdata->vif.cfg.aid; __entry->assoc = sdata->vif.cfg.assoc; __entry->ibss_joined = sdata->vif.cfg.ibss_joined; __entry->ibss_creator = sdata->vif.cfg.ibss_creator; __entry->ps = sdata->vif.cfg.ps; __entry->arp_addr_cnt = sdata->vif.cfg.arp_addr_cnt; memcpy(__get_dynamic_array(arp_addr_list), sdata->vif.cfg.arp_addr_list, sizeof(u32) * (sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : sdata->vif.cfg.arp_addr_cnt)); memcpy(__get_dynamic_array(ssid), sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len); __entry->s1g = sdata->vif.cfg.s1g; __entry->idle = sdata->vif.cfg.idle; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " changed:%#llx", LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed ) ); TRACE_EVENT(drv_link_info_changed, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, u64 changed), TP_ARGS(local, sdata, link_conf, changed), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u64, changed) __field(int, link_id) __field(bool, cts) __field(bool, shortpre) __field(bool, shortslot) __field(bool, enable_beacon) __field(u8, dtimper) __field(u16, bcnint) __field(u16, assoc_cap) __field(u64, sync_tsf) __field(u32, sync_device_ts) __field(u8, sync_dtim_count) __field(u32, basic_rates) __array(int, mcast_rate, NUM_NL80211_BANDS) __field(u16, ht_operation_mode) __field(s32, cqm_rssi_thold) __field(s32, cqm_rssi_hyst) __field(u32, channel_width) __field(u32, channel_cfreq1) __field(u32, channel_cfreq1_offset) __field(bool, qos) __field(bool, hidden_ssid) __field(int, txpower) __field(u8, p2p_oppps_ctwindow) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->changed = changed; __entry->link_id = link_conf->link_id; __entry->shortpre = link_conf->use_short_preamble; __entry->cts = link_conf->use_cts_prot; __entry->shortslot = link_conf->use_short_slot; __entry->enable_beacon = link_conf->enable_beacon; __entry->dtimper = link_conf->dtim_period; __entry->bcnint = link_conf->beacon_int; __entry->assoc_cap = link_conf->assoc_capability; __entry->sync_tsf = link_conf->sync_tsf; __entry->sync_device_ts = link_conf->sync_device_ts; __entry->sync_dtim_count = link_conf->sync_dtim_count; __entry->basic_rates = link_conf->basic_rates; memcpy(__entry->mcast_rate, link_conf->mcast_rate, sizeof(__entry->mcast_rate)); __entry->ht_operation_mode = link_conf->ht_operation_mode; __entry->cqm_rssi_thold = link_conf->cqm_rssi_thold; __entry->cqm_rssi_hyst = link_conf->cqm_rssi_hyst; __entry->channel_width = link_conf->chanreq.oper.width; __entry->channel_cfreq1 = link_conf->chanreq.oper.center_freq1; __entry->channel_cfreq1_offset = link_conf->chanreq.oper.freq1_offset; __entry->qos = link_conf->qos; __entry->hidden_ssid = link_conf->hidden_ssid; __entry->txpower = link_conf->txpower; __entry->p2p_oppps_ctwindow = link_conf->p2p_noa_attr.oppps_ctwindow; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " link_id:%d, changed:%#llx", LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->changed ) ); TRACE_EVENT(drv_prepare_multicast, TP_PROTO(struct ieee80211_local *local, int mc_count), TP_ARGS(local, mc_count), TP_STRUCT__entry( LOCAL_ENTRY __field(int, mc_count) ), TP_fast_assign( LOCAL_ASSIGN; __entry->mc_count = mc_count; ), TP_printk( LOCAL_PR_FMT " prepare mc (%d)", LOCAL_PR_ARG, __entry->mc_count ) ); TRACE_EVENT(drv_configure_filter, TP_PROTO(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, u64 multicast), TP_ARGS(local, changed_flags, total_flags, multicast), TP_STRUCT__entry( LOCAL_ENTRY __field(unsigned int, changed) __field(unsigned int, total) __field(u64, multicast) ), TP_fast_assign( LOCAL_ASSIGN; __entry->changed = changed_flags; __entry->total = *total_flags; __entry->multicast = multicast; ), TP_printk( LOCAL_PR_FMT " changed:%#x total:%#x", LOCAL_PR_ARG, __entry->changed, __entry->total ) ); TRACE_EVENT(drv_config_iface_filter, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int filter_flags, unsigned int changed_flags), TP_ARGS(local, sdata, filter_flags, changed_flags), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(unsigned int, filter_flags) __field(unsigned int, changed_flags) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->filter_flags = filter_flags; __entry->changed_flags = changed_flags; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " filter_flags: %#x changed_flags: %#x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->filter_flags, __entry->changed_flags ) ); TRACE_EVENT(drv_set_tim, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set), TP_ARGS(local, sta, set), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(bool, set) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->set = set; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " set:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->set ) ); TRACE_EVENT(drv_set_key, TP_PROTO(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key), TP_ARGS(local, cmd, sdata, sta, key), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, cmd) KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->cmd = cmd; KEY_ASSIGN(key); ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " cmd: %d" KEY_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd, KEY_PR_ARG ) ); TRACE_EVENT(drv_update_tkip_key, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_key_conf *conf, struct ieee80211_sta *sta, u32 iv32), TP_ARGS(local, sdata, conf, sta, iv32), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, iv32) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->iv32 = iv32; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " iv32:%#x", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->iv32 ) ); DEFINE_EVENT(local_sdata_evt, drv_hw_scan, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DEFINE_EVENT(local_sdata_evt, drv_cancel_hw_scan, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DEFINE_EVENT(local_sdata_evt, drv_sched_scan_start, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DEFINE_EVENT(local_sdata_evt, drv_sched_scan_stop, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_sw_scan_start, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const u8 *mac_addr), TP_ARGS(local, sdata, mac_addr), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __array(char, mac_addr, ETH_ALEN) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; memcpy(__entry->mac_addr, mac_addr, ETH_ALEN); ), TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT ", addr:%pM", LOCAL_PR_ARG, VIF_PR_ARG, __entry->mac_addr) ); DEFINE_EVENT(local_sdata_evt, drv_sw_scan_complete, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_get_stats, TP_PROTO(struct ieee80211_local *local, struct ieee80211_low_level_stats *stats, int ret), TP_ARGS(local, stats, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(int, ret) __field(unsigned int, ackfail) __field(unsigned int, rtsfail) __field(unsigned int, fcserr) __field(unsigned int, rtssucc) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; __entry->ackfail = stats->dot11ACKFailureCount; __entry->rtsfail = stats->dot11RTSFailureCount; __entry->fcserr = stats->dot11FCSErrorCount; __entry->rtssucc = stats->dot11RTSSuccessCount; ), TP_printk( LOCAL_PR_FMT " ret:%d", LOCAL_PR_ARG, __entry->ret ) ); TRACE_EVENT(drv_get_key_seq, TP_PROTO(struct ieee80211_local *local, struct ieee80211_key_conf *key), TP_ARGS(local, key), TP_STRUCT__entry( LOCAL_ENTRY KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; KEY_ASSIGN(key); ), TP_printk( LOCAL_PR_FMT KEY_PR_FMT, LOCAL_PR_ARG, KEY_PR_ARG ) ); DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold, TP_PROTO(struct ieee80211_local *local, u32 value), TP_ARGS(local, value) ); DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, TP_PROTO(struct ieee80211_local *local, u32 value), TP_ARGS(local, value) ); TRACE_EVENT(drv_set_coverage_class, TP_PROTO(struct ieee80211_local *local, s16 value), TP_ARGS(local, value), TP_STRUCT__entry( LOCAL_ENTRY __field(s16, value) ), TP_fast_assign( LOCAL_ASSIGN; __entry->value = value; ), TP_printk( LOCAL_PR_FMT " value:%d", LOCAL_PR_ARG, __entry->value ) ); TRACE_EVENT(drv_sta_notify, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum sta_notify_cmd cmd, struct ieee80211_sta *sta), TP_ARGS(local, sdata, cmd, sta), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, cmd) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->cmd = cmd; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " cmd:%d", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd ) ); TRACE_EVENT(drv_sta_state, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state), TP_ARGS(local, sdata, sta, old_state, new_state), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, old_state) __field(u32, new_state) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->old_state = old_state; __entry->new_state = new_state; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " state: %d->%d", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->old_state, __entry->new_state ) ); TRACE_EVENT(drv_sta_set_txpwr, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(s16, txpwr) __field(u8, type) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->txpwr = sta->deflink.txpwr.power; __entry->type = sta->deflink.txpwr.type; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " txpwr: %d type %d", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->txpwr, __entry->type ) ); TRACE_EVENT(drv_sta_rc_update, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed), TP_ARGS(local, sdata, sta, changed), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, changed) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->changed = changed; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " changed: 0x%x", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->changed ) ); DECLARE_EVENT_CLASS(sta_event, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG ) ); DEFINE_EVENT(sta_event, drv_sta_statistics, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); DEFINE_EVENT(sta_event, drv_sta_add, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); DEFINE_EVENT(sta_event, drv_sta_remove, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); DEFINE_EVENT(sta_event, drv_sta_pre_rcu_remove, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); DEFINE_EVENT(sta_event, drv_sync_rx_queues, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); DEFINE_EVENT(sta_event, drv_sta_rate_tbl_update, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); TRACE_EVENT(drv_conf_tx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int link_id, u16 ac, const struct ieee80211_tx_queue_params *params), TP_ARGS(local, sdata, link_id, ac, params), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(unsigned int, link_id) __field(u16, ac) __field(u16, txop) __field(u16, cw_min) __field(u16, cw_max) __field(u8, aifs) __field(bool, uapsd) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->link_id = link_id; __entry->ac = ac; __entry->txop = params->txop; __entry->cw_max = params->cw_max; __entry->cw_min = params->cw_min; __entry->aifs = params->aifs; __entry->uapsd = params->uapsd; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " link_id: %d, AC:%d", LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->ac ) ); DEFINE_EVENT(local_sdata_evt, drv_get_tsf, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_set_tsf, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf), TP_ARGS(local, sdata, tsf), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u64, tsf) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->tsf = tsf; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " tsf:%llu", LOCAL_PR_ARG, VIF_PR_ARG, (unsigned long long)__entry->tsf ) ); TRACE_EVENT(drv_offset_tsf, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset), TP_ARGS(local, sdata, offset), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(s64, tsf_offset) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->tsf_offset = offset; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " tsf offset:%lld", LOCAL_PR_ARG, VIF_PR_ARG, (unsigned long long)__entry->tsf_offset ) ); DEFINE_EVENT(local_sdata_evt, drv_reset_tsf, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DEFINE_EVENT(local_only_evt, drv_tx_last_beacon, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(drv_ampdu_action, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params), TP_ARGS(local, sdata, params), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY AMPDU_ACTION_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; AMPDU_ACTION_ASSIGN; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG ) ); TRACE_EVENT(drv_get_survey, TP_PROTO(struct ieee80211_local *local, int _idx, struct survey_info *survey), TP_ARGS(local, _idx, survey), TP_STRUCT__entry( LOCAL_ENTRY __field(int, idx) ), TP_fast_assign( LOCAL_ASSIGN; __entry->idx = _idx; ), TP_printk( LOCAL_PR_FMT " idx:%d", LOCAL_PR_ARG, __entry->idx ) ); TRACE_EVENT(drv_flush, TP_PROTO(struct ieee80211_local *local, u32 queues, bool drop), TP_ARGS(local, queues, drop), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, drop) __field(u32, queues) ), TP_fast_assign( LOCAL_ASSIGN; __entry->drop = drop; __entry->queues = queues; ), TP_printk( LOCAL_PR_FMT " queues:0x%x drop:%d", LOCAL_PR_ARG, __entry->queues, __entry->drop ) ); DEFINE_EVENT(sta_event, drv_flush_sta, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); DECLARE_EVENT_CLASS(chanswitch_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch), TP_ARGS(local, sdata, ch_switch), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY CHANDEF_ENTRY __field(u64, timestamp) __field(u32, device_timestamp) __field(bool, block_tx) __field(u8, count) __field(u8, link_id) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; CHANDEF_ASSIGN(&ch_switch->chandef) __entry->timestamp = ch_switch->timestamp; __entry->device_timestamp = ch_switch->device_timestamp; __entry->block_tx = ch_switch->block_tx; __entry->count = ch_switch->count; __entry->link_id = ch_switch->link_id; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT CHANDEF_PR_FMT " count:%d block_tx:%d timestamp:%llu device_ts:%u link_id:%d", LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count, __entry->block_tx, __entry->timestamp, __entry->device_timestamp, __entry->link_id ) ); DEFINE_EVENT(chanswitch_evt, drv_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch), TP_ARGS(local, sdata, ch_switch) ); TRACE_EVENT(drv_set_antenna, TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), TP_ARGS(local, tx_ant, rx_ant, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, tx_ant) __field(u32, rx_ant) __field(int, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->tx_ant = tx_ant; __entry->rx_ant = rx_ant; __entry->ret = ret; ), TP_printk( LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d", LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret ) ); TRACE_EVENT(drv_get_antenna, TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), TP_ARGS(local, tx_ant, rx_ant, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, tx_ant) __field(u32, rx_ant) __field(int, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->tx_ant = tx_ant; __entry->rx_ant = rx_ant; __entry->ret = ret; ), TP_printk( LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d", LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret ) ); TRACE_EVENT(drv_remain_on_channel, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, unsigned int duration, enum ieee80211_roc_type type), TP_ARGS(local, sdata, chan, duration, type), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(int, center_freq) __field(int, freq_offset) __field(unsigned int, duration) __field(u32, type) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->center_freq = chan->center_freq; __entry->freq_offset = chan->freq_offset; __entry->duration = duration; __entry->type = type; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " freq:%d.%03dMHz duration:%dms type=%d", LOCAL_PR_ARG, VIF_PR_ARG, __entry->center_freq, __entry->freq_offset, __entry->duration, __entry->type ) ); DEFINE_EVENT(local_sdata_evt, drv_cancel_remain_on_channel, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_set_ringparam, TP_PROTO(struct ieee80211_local *local, u32 tx, u32 rx), TP_ARGS(local, tx, rx), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, tx) __field(u32, rx) ), TP_fast_assign( LOCAL_ASSIGN; __entry->tx = tx; __entry->rx = rx; ), TP_printk( LOCAL_PR_FMT " tx:%d rx %d", LOCAL_PR_ARG, __entry->tx, __entry->rx ) ); TRACE_EVENT(drv_get_ringparam, TP_PROTO(struct ieee80211_local *local, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max), TP_ARGS(local, tx, tx_max, rx, rx_max), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, tx) __field(u32, tx_max) __field(u32, rx) __field(u32, rx_max) ), TP_fast_assign( LOCAL_ASSIGN; __entry->tx = *tx; __entry->tx_max = *tx_max; __entry->rx = *rx; __entry->rx_max = *rx_max; ), TP_printk( LOCAL_PR_FMT " tx:%d tx_max %d rx %d rx_max %d", LOCAL_PR_ARG, __entry->tx, __entry->tx_max, __entry->rx, __entry->rx_max ) ); DEFINE_EVENT(local_only_evt, drv_tx_frames_pending, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_only_evt, drv_offchannel_tx_cancel_wait, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(drv_set_bitrate_mask, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_bitrate_mask *mask), TP_ARGS(local, sdata, mask), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, legacy_2g) __field(u32, legacy_5g) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->legacy_2g = mask->control[NL80211_BAND_2GHZ].legacy; __entry->legacy_5g = mask->control[NL80211_BAND_5GHZ].legacy; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " 2G Mask:0x%x 5G Mask:0x%x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->legacy_2g, __entry->legacy_5g ) ); TRACE_EVENT(drv_set_rekey_data, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_gtk_rekey_data *data), TP_ARGS(local, sdata, data), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __array(u8, kek, NL80211_KEK_LEN) __array(u8, kck, NL80211_KCK_LEN) __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; memcpy(__entry->kek, data->kek, NL80211_KEK_LEN); memcpy(__entry->kck, data->kck, NL80211_KCK_LEN); memcpy(__entry->replay_ctr, data->replay_ctr, NL80211_REPLAY_CTR_LEN); ), TP_printk(LOCAL_PR_FMT VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG) ); TRACE_EVENT(drv_event_callback, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct ieee80211_event *_event), TP_ARGS(local, sdata, _event), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, type) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->type = _event->type; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " event:%d", LOCAL_PR_ARG, VIF_PR_ARG, __entry->type ) ); DECLARE_EVENT_CLASS(release_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data), TP_ARGS(local, sta, tids, num_frames, reason, more_data), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(u16, tids) __field(int, num_frames) __field(int, reason) __field(bool, more_data) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->tids = tids; __entry->num_frames = num_frames; __entry->reason = reason; __entry->more_data = more_data; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " TIDs:0x%.4x frames:%d reason:%d more:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->tids, __entry->num_frames, __entry->reason, __entry->more_data ) ); DEFINE_EVENT(release_evt, drv_release_buffered_frames, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data), TP_ARGS(local, sta, tids, num_frames, reason, more_data) ); DEFINE_EVENT(release_evt, drv_allow_buffered_frames, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data), TP_ARGS(local, sta, tids, num_frames, reason, more_data) ); DECLARE_EVENT_CLASS(mgd_prepare_complete_tx_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 duration, u16 subtype, bool success), TP_ARGS(local, sdata, duration, subtype, success), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, duration) __field(u16, subtype) __field(u8, success) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->duration = duration; __entry->subtype = subtype; __entry->success = success; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " duration: %u, subtype:0x%x, success:%d", LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration, __entry->subtype, __entry->success ) ); DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_prepare_tx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 duration, u16 subtype, bool success), TP_ARGS(local, sdata, duration, subtype, success) ); DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_complete_tx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 duration, u16 subtype, bool success), TP_ARGS(local, sdata, duration, subtype, success) ); DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DECLARE_EVENT_CLASS(local_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_chanctx *ctx), TP_ARGS(local, ctx), TP_STRUCT__entry( LOCAL_ENTRY CHANCTX_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; CHANCTX_ASSIGN; ), TP_printk( LOCAL_PR_FMT CHANCTX_PR_FMT, LOCAL_PR_ARG, CHANCTX_PR_ARG ) ); DEFINE_EVENT(local_chanctx, drv_add_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_chanctx *ctx), TP_ARGS(local, ctx) ); DEFINE_EVENT(local_chanctx, drv_remove_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_chanctx *ctx), TP_ARGS(local, ctx) ); TRACE_EVENT(drv_change_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, u32 changed), TP_ARGS(local, ctx, changed), TP_STRUCT__entry( LOCAL_ENTRY CHANCTX_ENTRY __field(u32, changed) ), TP_fast_assign( LOCAL_ASSIGN; CHANCTX_ASSIGN; __entry->changed = changed; ), TP_printk( LOCAL_PR_FMT CHANCTX_PR_FMT " changed:%#x", LOCAL_PR_ARG, CHANCTX_PR_ARG, __entry->changed ) ); #if !defined(__TRACE_VIF_ENTRY) #define __TRACE_VIF_ENTRY struct trace_vif_entry { enum nl80211_iftype vif_type; bool p2p; char vif_name[IFNAMSIZ]; } __packed; struct trace_chandef_entry { u32 control_freq; u32 freq_offset; u32 chan_width; u32 center_freq1; u32 freq1_offset; u32 center_freq2; } __packed; struct trace_switch_entry { struct trace_vif_entry vif; unsigned int link_id; struct trace_chandef_entry old_chandef; struct trace_chandef_entry new_chandef; } __packed; #define SWITCH_ENTRY_ASSIGN(to, from) local_vifs[i].to = vifs[i].from #endif TRACE_EVENT(drv_switch_vif_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode), TP_ARGS(local, vifs, n_vifs, mode), TP_STRUCT__entry( LOCAL_ENTRY __field(int, n_vifs) __field(u32, mode) __dynamic_array(u8, vifs, sizeof(struct trace_switch_entry) * n_vifs) ), TP_fast_assign( LOCAL_ASSIGN; __entry->n_vifs = n_vifs; __entry->mode = mode; { struct trace_switch_entry *local_vifs = __get_dynamic_array(vifs); int i; for (i = 0; i < n_vifs; i++) { struct ieee80211_sub_if_data *sdata; sdata = container_of(vifs[i].vif, struct ieee80211_sub_if_data, vif); SWITCH_ENTRY_ASSIGN(vif.vif_type, vif->type); SWITCH_ENTRY_ASSIGN(vif.p2p, vif->p2p); SWITCH_ENTRY_ASSIGN(link_id, link_conf->link_id); strncpy(local_vifs[i].vif.vif_name, sdata->name, sizeof(local_vifs[i].vif.vif_name)); SWITCH_ENTRY_ASSIGN(old_chandef.control_freq, old_ctx->def.chan->center_freq); SWITCH_ENTRY_ASSIGN(old_chandef.freq_offset, old_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(old_chandef.chan_width, old_ctx->def.width); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1, old_ctx->def.center_freq1); SWITCH_ENTRY_ASSIGN(old_chandef.freq1_offset, old_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2, old_ctx->def.center_freq2); SWITCH_ENTRY_ASSIGN(new_chandef.control_freq, new_ctx->def.chan->center_freq); SWITCH_ENTRY_ASSIGN(new_chandef.freq_offset, new_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(new_chandef.chan_width, new_ctx->def.width); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1, new_ctx->def.center_freq1); SWITCH_ENTRY_ASSIGN(new_chandef.freq1_offset, new_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2, new_ctx->def.center_freq2); } } ), TP_printk( LOCAL_PR_FMT " n_vifs:%d mode:%d", LOCAL_PR_ARG, __entry->n_vifs, __entry->mode ) ); DECLARE_EVENT_CLASS(local_sdata_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx), TP_ARGS(local, sdata, link_conf, ctx), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY CHANCTX_ENTRY __field(unsigned int, link_id) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; CHANCTX_ASSIGN; __entry->link_id = link_conf->link_id; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " link_id:%d" CHANCTX_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, CHANCTX_PR_ARG ) ); DEFINE_EVENT(local_sdata_chanctx, drv_assign_vif_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx), TP_ARGS(local, sdata, link_conf, ctx) ); DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx), TP_ARGS(local, sdata, link_conf, ctx) ); TRACE_EVENT(drv_start_ap, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf), TP_ARGS(local, sdata, link_conf), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, link_id) __field(u8, dtimper) __field(u16, bcnint) __dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len) __field(bool, hidden_ssid) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->link_id = link_conf->link_id; __entry->dtimper = link_conf->dtim_period; __entry->bcnint = link_conf->beacon_int; __entry->hidden_ssid = link_conf->hidden_ssid; memcpy(__get_dynamic_array(ssid), sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len); ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " link id %u", LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id ) ); TRACE_EVENT(drv_stop_ap, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf), TP_ARGS(local, sdata, link_conf), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, link_id) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->link_id = link_conf->link_id; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " link id %u", LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id ) ); TRACE_EVENT(drv_reconfig_complete, TP_PROTO(struct ieee80211_local *local, enum ieee80211_reconfig_type reconfig_type), TP_ARGS(local, reconfig_type), TP_STRUCT__entry( LOCAL_ENTRY __field(u8, reconfig_type) ), TP_fast_assign( LOCAL_ASSIGN; __entry->reconfig_type = reconfig_type; ), TP_printk( LOCAL_PR_FMT " reconfig_type:%d", LOCAL_PR_ARG, __entry->reconfig_type ) ); #if IS_ENABLED(CONFIG_IPV6) DEFINE_EVENT(local_sdata_evt, drv_ipv6_addr_change, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); #endif TRACE_EVENT(drv_join_ibss, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info), TP_ARGS(local, sdata, info), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u8, dtimper) __field(u16, bcnint) __dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->dtimper = info->dtim_period; __entry->bcnint = info->beacon_int; memcpy(__get_dynamic_array(ssid), sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len); ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG ) ); DEFINE_EVENT(local_sdata_evt, drv_leave_ibss, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_get_expected_throughput, TP_PROTO(struct ieee80211_sta *sta), TP_ARGS(sta), TP_STRUCT__entry( STA_ENTRY ), TP_fast_assign( STA_ASSIGN; ), TP_printk( STA_PR_FMT, STA_PR_ARG ) ); TRACE_EVENT(drv_start_nan, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf), TP_ARGS(local, sdata, conf), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u8, master_pref) __field(u8, bands) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->master_pref = conf->master_pref; __entry->bands = conf->bands; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT ", master preference: %u, bands: 0x%0x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, __entry->bands ) ); TRACE_EVENT(drv_stop_nan, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG ) ); TRACE_EVENT(drv_nan_change_conf, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf, u32 changes), TP_ARGS(local, sdata, conf, changes), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u8, master_pref) __field(u8, bands) __field(u32, changes) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->master_pref = conf->master_pref; __entry->bands = conf->bands; __entry->changes = changes; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT ", master preference: %u, bands: 0x%0x, changes: 0x%x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, __entry->bands, __entry->changes ) ); TRACE_EVENT(drv_add_nan_func, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_nan_func *func), TP_ARGS(local, sdata, func), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u8, type) __field(u8, inst_id) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->type = func->type; __entry->inst_id = func->instance_id; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT ", type: %u, inst_id: %u", LOCAL_PR_ARG, VIF_PR_ARG, __entry->type, __entry->inst_id ) ); TRACE_EVENT(drv_del_nan_func, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u8 instance_id), TP_ARGS(local, sdata, instance_id), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u8, instance_id) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->instance_id = instance_id; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT ", instance_id: %u", LOCAL_PR_ARG, VIF_PR_ARG, __entry->instance_id ) ); DEFINE_EVENT(local_sdata_evt, drv_start_pmsr, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DEFINE_EVENT(local_sdata_evt, drv_abort_pmsr, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_set_default_unicast_key, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int key_idx), TP_ARGS(local, sdata, key_idx), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(int, key_idx) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->key_idx = key_idx; ), TP_printk(LOCAL_PR_FMT VIF_PR_FMT " key_idx:%d", LOCAL_PR_ARG, VIF_PR_ARG, __entry->key_idx) ); TRACE_EVENT(drv_channel_switch_beacon, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def *chandef), TP_ARGS(local, sdata, chandef), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY CHANDEF_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; CHANDEF_ASSIGN(chandef); ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " channel switch to " CHANDEF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG ) ); DEFINE_EVENT(chanswitch_evt, drv_pre_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch), TP_ARGS(local, sdata, ch_switch) ); DEFINE_EVENT(local_sdata_evt, drv_post_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DEFINE_EVENT(local_sdata_evt, drv_abort_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DEFINE_EVENT(chanswitch_evt, drv_channel_switch_rx_beacon, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch), TP_ARGS(local, sdata, ch_switch) ); TRACE_EVENT(drv_get_txpower, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int dbm, int ret), TP_ARGS(local, sdata, dbm, ret), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(int, dbm) __field(int, ret) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->dbm = dbm; __entry->ret = ret; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " dbm:%d ret:%d", LOCAL_PR_ARG, VIF_PR_ARG, __entry->dbm, __entry->ret ) ); TRACE_EVENT(drv_tdls_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef), TP_ARGS(local, sdata, sta, oper_class, chandef), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u8, oper_class) CHANDEF_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->oper_class = oper_class; CHANDEF_ASSIGN(chandef) ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " tdls channel switch to" CHANDEF_PR_FMT " oper_class:%d " STA_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->oper_class, STA_PR_ARG ) ); TRACE_EVENT(drv_tdls_cancel_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " tdls cancel channel switch with " STA_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG ) ); TRACE_EVENT(drv_tdls_recv_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_tdls_ch_sw_params *params), TP_ARGS(local, sdata, params), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u8, action_code) STA_ENTRY CHANDEF_ENTRY __field(u32, status) __field(bool, peer_initiator) __field(u32, timestamp) __field(u16, switch_time) __field(u16, switch_timeout) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_NAMED_ASSIGN(params->sta); CHANDEF_ASSIGN(params->chandef) __entry->peer_initiator = params->sta->tdls_initiator; __entry->action_code = params->action_code; __entry->status = params->status; __entry->timestamp = params->timestamp; __entry->switch_time = params->switch_time; __entry->switch_timeout = params->switch_timeout; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " received tdls channel switch packet" " action:%d status:%d time:%d switch time:%d switch" " timeout:%d initiator: %d chan:" CHANDEF_PR_FMT STA_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, __entry->action_code, __entry->status, __entry->timestamp, __entry->switch_time, __entry->switch_timeout, __entry->peer_initiator, CHANDEF_PR_ARG, STA_PR_ARG ) ); TRACE_EVENT(drv_wake_tx_queue, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct txq_info *txq), TP_ARGS(local, sdata, txq), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u8, ac) __field(u8, tid) ), TP_fast_assign( struct ieee80211_sta *sta = txq->txq.sta; LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->ac = txq->txq.ac; __entry->tid = txq->txq.tid; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " ac:%d tid:%d", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ac, __entry->tid ) ); TRACE_EVENT(drv_get_ftm_responder_stats, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_ftm_responder_stats *ftm_stats), TP_ARGS(local, sdata, ftm_stats), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG ) ); DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); DECLARE_EVENT_CLASS(sta_flag_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled), TP_ARGS(local, sdata, sta, enabled), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(bool, enabled) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->enabled = enabled; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " enabled:%d", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->enabled ) ); DEFINE_EVENT(sta_flag_evt, drv_sta_set_4addr, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled), TP_ARGS(local, sdata, sta, enabled) ); DEFINE_EVENT(sta_flag_evt, drv_sta_set_decap_offload, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled), TP_ARGS(local, sdata, sta, enabled) ); TRACE_EVENT(drv_add_twt_setup, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt, struct ieee80211_twt_params *twt_agrt), TP_ARGS(local, sta, twt, twt_agrt), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(u8, dialog_token) __field(u8, control) __field(__le16, req_type) __field(__le64, twt) __field(u8, duration) __field(__le16, mantissa) __field(u8, channel) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->dialog_token = twt->dialog_token; __entry->control = twt->control; __entry->req_type = twt_agrt->req_type; __entry->twt = twt_agrt->twt; __entry->duration = twt_agrt->min_twt_dur; __entry->mantissa = twt_agrt->mantissa; __entry->channel = twt_agrt->channel; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " token:%d control:0x%02x req_type:0x%04x" " twt:%llu duration:%d mantissa:%d channel:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->dialog_token, __entry->control, le16_to_cpu(__entry->req_type), le64_to_cpu(__entry->twt), __entry->duration, le16_to_cpu(__entry->mantissa), __entry->channel ) ); TRACE_EVENT(drv_twt_teardown_request, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, u8 flowid), TP_ARGS(local, sta, flowid), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(u8, flowid) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->flowid = flowid; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " flowid:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->flowid ) ); DEFINE_EVENT(sta_event, drv_net_fill_forward_path, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), TP_ARGS(local, sdata, sta) ); TRACE_EVENT(drv_net_setup_tc, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u8 type), TP_ARGS(local, sdata, type), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u8, type) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->type = type; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " type:%d\n", LOCAL_PR_ARG, VIF_PR_ARG, __entry->type ) ); TRACE_EVENT(drv_can_activate_links, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 active_links), TP_ARGS(local, sdata, active_links), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u16, active_links) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->active_links = active_links; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " requested active_links:0x%04x\n", LOCAL_PR_ARG, VIF_PR_ARG, __entry->active_links ) ); TRACE_EVENT(drv_change_vif_links, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 old_links, u16 new_links), TP_ARGS(local, sdata, old_links, new_links), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u16, old_links) __field(u16, new_links) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->old_links = old_links; __entry->new_links = new_links; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " old_links:0x%04x, new_links:0x%04x\n", LOCAL_PR_ARG, VIF_PR_ARG, __entry->old_links, __entry->new_links ) ); TRACE_EVENT(drv_change_sta_links, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u16 old_links, u16 new_links), TP_ARGS(local, sdata, sta, old_links, new_links), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u16, old_links) __field(u16, new_links) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->old_links = old_links; __entry->new_links = new_links; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " old_links:0x%04x, new_links:0x%04x\n", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->old_links, __entry->new_links ) ); /* * Tracing for API calls that drivers call. */ TRACE_EVENT(api_start_tx_ba_session, TP_PROTO(struct ieee80211_sta *sta, u16 tid), TP_ARGS(sta, tid), TP_STRUCT__entry( STA_ENTRY __field(u16, tid) ), TP_fast_assign( STA_ASSIGN; __entry->tid = tid; ), TP_printk( STA_PR_FMT " tid:%d", STA_PR_ARG, __entry->tid ) ); TRACE_EVENT(api_start_tx_ba_cb, TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid), TP_ARGS(sdata, ra, tid), TP_STRUCT__entry( VIF_ENTRY __array(u8, ra, ETH_ALEN) __field(u16, tid) ), TP_fast_assign( VIF_ASSIGN; memcpy(__entry->ra, ra, ETH_ALEN); __entry->tid = tid; ), TP_printk( VIF_PR_FMT " ra:%pM tid:%d", VIF_PR_ARG, __entry->ra, __entry->tid ) ); TRACE_EVENT(api_stop_tx_ba_session, TP_PROTO(struct ieee80211_sta *sta, u16 tid), TP_ARGS(sta, tid), TP_STRUCT__entry( STA_ENTRY __field(u16, tid) ), TP_fast_assign( STA_ASSIGN; __entry->tid = tid; ), TP_printk( STA_PR_FMT " tid:%d", STA_PR_ARG, __entry->tid ) ); TRACE_EVENT(api_stop_tx_ba_cb, TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid), TP_ARGS(sdata, ra, tid), TP_STRUCT__entry( VIF_ENTRY __array(u8, ra, ETH_ALEN) __field(u16, tid) ), TP_fast_assign( VIF_ASSIGN; memcpy(__entry->ra, ra, ETH_ALEN); __entry->tid = tid; ), TP_printk( VIF_PR_FMT " ra:%pM tid:%d", VIF_PR_ARG, __entry->ra, __entry->tid ) ); DEFINE_EVENT(local_only_evt, api_restart_hw, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(api_beacon_loss, TP_PROTO(struct ieee80211_sub_if_data *sdata), TP_ARGS(sdata), TP_STRUCT__entry( VIF_ENTRY ), TP_fast_assign( VIF_ASSIGN; ), TP_printk( VIF_PR_FMT, VIF_PR_ARG ) ); TRACE_EVENT(api_connection_loss, TP_PROTO(struct ieee80211_sub_if_data *sdata), TP_ARGS(sdata), TP_STRUCT__entry( VIF_ENTRY ), TP_fast_assign( VIF_ASSIGN; ), TP_printk( VIF_PR_FMT, VIF_PR_ARG ) ); TRACE_EVENT(api_disconnect, TP_PROTO(struct ieee80211_sub_if_data *sdata, bool reconnect), TP_ARGS(sdata, reconnect), TP_STRUCT__entry( VIF_ENTRY __field(int, reconnect) ), TP_fast_assign( VIF_ASSIGN; __entry->reconnect = reconnect; ), TP_printk( VIF_PR_FMT " reconnect:%d", VIF_PR_ARG, __entry->reconnect ) ); TRACE_EVENT(api_cqm_rssi_notify, TP_PROTO(struct ieee80211_sub_if_data *sdata, enum nl80211_cqm_rssi_threshold_event rssi_event, s32 rssi_level), TP_ARGS(sdata, rssi_event, rssi_level), TP_STRUCT__entry( VIF_ENTRY __field(u32, rssi_event) __field(s32, rssi_level) ), TP_fast_assign( VIF_ASSIGN; __entry->rssi_event = rssi_event; __entry->rssi_level = rssi_level; ), TP_printk( VIF_PR_FMT " event:%d rssi:%d", VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level ) ); DEFINE_EVENT(local_sdata_evt, api_cqm_beacon_loss_notify, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(api_scan_completed, TP_PROTO(struct ieee80211_local *local, bool aborted), TP_ARGS(local, aborted), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, aborted) ), TP_fast_assign( LOCAL_ASSIGN; __entry->aborted = aborted; ), TP_printk( LOCAL_PR_FMT " aborted:%d", LOCAL_PR_ARG, __entry->aborted ) ); TRACE_EVENT(api_sched_scan_results, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local), TP_STRUCT__entry( LOCAL_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; ), TP_printk( LOCAL_PR_FMT, LOCAL_PR_ARG ) ); TRACE_EVENT(api_sched_scan_stopped, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local), TP_STRUCT__entry( LOCAL_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; ), TP_printk( LOCAL_PR_FMT, LOCAL_PR_ARG ) ); TRACE_EVENT(api_sta_block_awake, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, bool block), TP_ARGS(local, sta, block), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(bool, block) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->block = block; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " block:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->block ) ); TRACE_EVENT(api_chswitch_done, TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success, unsigned int link_id), TP_ARGS(sdata, success, link_id), TP_STRUCT__entry( VIF_ENTRY __field(bool, success) __field(unsigned int, link_id) ), TP_fast_assign( VIF_ASSIGN; __entry->success = success; __entry->link_id = link_id; ), TP_printk( VIF_PR_FMT " success=%d link_id=%d", VIF_PR_ARG, __entry->success, __entry->link_id ) ); DEFINE_EVENT(local_only_evt, api_ready_on_channel, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_only_evt, api_remain_on_channel_expired, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(api_gtk_rekey_notify, TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *replay_ctr), TP_ARGS(sdata, bssid, replay_ctr), TP_STRUCT__entry( VIF_ENTRY __array(u8, bssid, ETH_ALEN) __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN) ), TP_fast_assign( VIF_ASSIGN; memcpy(__entry->bssid, bssid, ETH_ALEN); memcpy(__entry->replay_ctr, replay_ctr, NL80211_REPLAY_CTR_LEN); ), TP_printk(VIF_PR_FMT, VIF_PR_ARG) ); TRACE_EVENT(api_enable_rssi_reports, TP_PROTO(struct ieee80211_sub_if_data *sdata, int rssi_min_thold, int rssi_max_thold), TP_ARGS(sdata, rssi_min_thold, rssi_max_thold), TP_STRUCT__entry( VIF_ENTRY __field(int, rssi_min_thold) __field(int, rssi_max_thold) ), TP_fast_assign( VIF_ASSIGN; __entry->rssi_min_thold = rssi_min_thold; __entry->rssi_max_thold = rssi_max_thold; ), TP_printk( VIF_PR_FMT " rssi_min_thold =%d, rssi_max_thold = %d", VIF_PR_ARG, __entry->rssi_min_thold, __entry->rssi_max_thold ) ); TRACE_EVENT(api_eosp, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta), TP_ARGS(local, sta), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT, LOCAL_PR_ARG, STA_PR_ARG ) ); TRACE_EVENT(api_send_eosp_nullfunc, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, u8 tid), TP_ARGS(local, sta, tid), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(u8, tid) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->tid = tid; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " tid:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->tid ) ); TRACE_EVENT(api_sta_set_buffered, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, u8 tid, bool buffered), TP_ARGS(local, sta, tid, buffered), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(u8, tid) __field(bool, buffered) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->tid = tid; __entry->buffered = buffered; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " tid:%d buffered:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->tid, __entry->buffered ) ); TRACE_EVENT(api_radar_detected, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local), TP_STRUCT__entry( LOCAL_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; ), TP_printk( LOCAL_PR_FMT " radar detected", LOCAL_PR_ARG ) ); TRACE_EVENT(api_request_smps, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, enum ieee80211_smps_mode smps_mode), TP_ARGS(local, sdata, link, smps_mode), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(int, link_id) __field(u32, smps_mode) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->link_id = link->link_id, __entry->smps_mode = smps_mode; ), TP_printk( LOCAL_PR_FMT " " VIF_PR_FMT " link:%d, smps_mode:%d", LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->smps_mode ) ); /* * Tracing for internal functions * (which may also be called in response to driver calls) */ TRACE_EVENT(wake_queue, TP_PROTO(struct ieee80211_local *local, u16 queue, enum queue_stop_reason reason), TP_ARGS(local, queue, reason), TP_STRUCT__entry( LOCAL_ENTRY __field(u16, queue) __field(u32, reason) ), TP_fast_assign( LOCAL_ASSIGN; __entry->queue = queue; __entry->reason = reason; ), TP_printk( LOCAL_PR_FMT " queue:%d, reason:%d", LOCAL_PR_ARG, __entry->queue, __entry->reason ) ); TRACE_EVENT(stop_queue, TP_PROTO(struct ieee80211_local *local, u16 queue, enum queue_stop_reason reason), TP_ARGS(local, queue, reason), TP_STRUCT__entry( LOCAL_ENTRY __field(u16, queue) __field(u32, reason) ), TP_fast_assign( LOCAL_ASSIGN; __entry->queue = queue; __entry->reason = reason; ), TP_printk( LOCAL_PR_FMT " queue:%d, reason:%d", LOCAL_PR_ARG, __entry->queue, __entry->reason ) ); TRACE_EVENT(drv_can_neg_ttlm, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm *neg_ttlm), TP_ARGS(local, sdata, neg_ttlm), TP_STRUCT__entry(LOCAL_ENTRY VIF_ENTRY __array(u16, downlink, sizeof(u16) * 8) __array(u16, uplink, sizeof(u16) * 8) ), TP_fast_assign(LOCAL_ASSIGN; VIF_ASSIGN; memcpy(__entry->downlink, neg_ttlm->downlink, sizeof(neg_ttlm->downlink)); memcpy(__entry->uplink, neg_ttlm->uplink, sizeof(neg_ttlm->uplink)); ), TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG) ); TRACE_EVENT(drv_neg_ttlm_res, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum ieee80211_neg_ttlm_res res, struct ieee80211_neg_ttlm *neg_ttlm), TP_ARGS(local, sdata, res, neg_ttlm), TP_STRUCT__entry(LOCAL_ENTRY VIF_ENTRY __field(u32, res) __array(u16, downlink, sizeof(u16) * 8) __array(u16, uplink, sizeof(u16) * 8) ), TP_fast_assign(LOCAL_ASSIGN; VIF_ASSIGN; __entry->res = res; memcpy(__entry->downlink, neg_ttlm->downlink, sizeof(neg_ttlm->downlink)); memcpy(__entry->uplink, neg_ttlm->uplink, sizeof(neg_ttlm->uplink)); ), TP_printk(LOCAL_PR_FMT VIF_PR_FMT " response: %d\n ", LOCAL_PR_ARG, VIF_PR_ARG, __entry->res ) ); #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
258 258 258 256 258 258 59 59 59 59 258 257 258 257 256 123 124 258 257 258 124 159 258 256 256 257 258 258 258 257 257 160 255 256 258 258 258 258 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ #include <linux/kernel.h> #ifdef __KERNEL__ #include <linux/string.h> #else #include <string.h> #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ #include <asm/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> #include <asm/emulate_prefix.h> /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ __typeof__(t) v; \ switch (sizeof(t)) { \ case 4: v = le32_to_cpu(r); break; \ case 2: v = le16_to_cpu(r); break; \ case 1: v = r; break; \ default: \ BUILD_BUG(); break; \ } \ v; \ }) /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) #define peek_nbyte_next(t, insn, n) \ ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) #define peek_next(t, insn) peek_nbyte_next(t, insn, 0) /** * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { /* * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid * even if the input buffer is long enough to hold them. */ if (buf_len > MAX_INSN_SIZE) buf_len = MAX_INSN_SIZE; memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64; insn->opnd_bytes = 4; if (x86_64) insn->addr_bytes = 8; else insn->addr_bytes = 4; } static const insn_byte_t xen_prefix[] = { __XEN_EMULATE_PREFIX }; static const insn_byte_t kvm_prefix[] = { __KVM_EMULATE_PREFIX }; static int __insn_get_emulate_prefix(struct insn *insn, const insn_byte_t *prefix, size_t len) { size_t i; for (i = 0; i < len; i++) { if (peek_nbyte_next(insn_byte_t, insn, i) != prefix[i]) goto err_out; } insn->emulate_prefix_size = len; insn->next_byte += len; return 1; err_out: return 0; } static void insn_get_emulate_prefix(struct insn *insn) { if (__insn_get_emulate_prefix(insn, xen_prefix, sizeof(xen_prefix))) return; __insn_get_emulate_prefix(insn, kvm_prefix, sizeof(kvm_prefix)); } /** * insn_get_prefixes - scan x86 instruction prefix bytes * @insn: &struct insn containing instruction * * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. * * * Returns: * 0: on success * < 0: on error */ int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; insn_byte_t b, lb; int i, nb; if (prefixes->got) return 0; insn_get_emulate_prefix(insn); nb = 0; lb = 0; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); while (inat_is_legacy_prefix(attr)) { /* Skip if same prefix */ for (i = 0; i < nb; i++) if (prefixes->bytes[i] == b) goto found; if (nb == 4) /* Invalid instruction */ break; prefixes->bytes[nb++] = b; if (inat_is_address_size_prefix(attr)) { /* address size switches 2/4 or 4/8 */ if (insn->x86_64) insn->addr_bytes ^= 12; else insn->addr_bytes ^= 6; } else if (inat_is_operand_size_prefix(attr)) { /* oprand size switches 2/4 */ insn->opnd_bytes ^= 6; } found: prefixes->nbytes++; insn->next_byte++; lb = b; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); } /* Set the last prefix */ if (lb && lb != insn->prefixes.bytes[3]) { if (unlikely(insn->prefixes.bytes[3])) { /* Swap the last prefix */ b = insn->prefixes.bytes[3]; for (i = 0; i < nb; i++) if (prefixes->bytes[i] == lb) insn_set_byte(prefixes, i, b); } insn_set_byte(&insn->prefixes, 3, lb); } /* Decode REX prefix */ if (insn->x86_64) { b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_rex_prefix(attr)) { insn_field_set(&insn->rex_prefix, b, 1); insn->next_byte++; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; } } insn->rex_prefix.got = 1; /* Decode VEX prefix */ b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_vex_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is * LDS or LES or BOUND. */ if (X86_MODRM_MOD(b2) != 3) goto vex_end; } insn_set_byte(&insn->vex_prefix, 0, b); insn_set_byte(&insn->vex_prefix, 1, b2); if (inat_is_evex_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); b2 = peek_nbyte_next(insn_byte_t, insn, 3); insn_set_byte(&insn->vex_prefix, 3, b2); insn->vex_prefix.nbytes = 4; insn->next_byte += 4; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else if (inat_is_vex3_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* * For VEX2, fake VEX3-like byte#2. * Makes it easier to decode vex.W, vex.vvvv, * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. */ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f); insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } } vex_end: insn->vex_prefix.got = 1; prefixes->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_opcode - collect opcode(s) * @insn: &struct insn containing instruction * * Populates @insn->opcode, updates @insn->next_byte to point past the * opcode byte(s), and set @insn->attr (except for groups). * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; int pfx_id, ret; insn_byte_t op; if (opcode->got) return 0; ret = insn_get_prefixes(insn); if (ret) return ret; /* Get first opcode */ op = get_next(insn_byte_t, insn); insn_set_byte(opcode, 0, op); opcode->nbytes = 1; /* Check if there is VEX prefix or not */ if (insn_is_avx(insn)) { insn_byte_t m, p; m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } /* VEX has only 1 byte for opcode */ goto end; } insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); opcode->bytes[opcode->nbytes++] = op; pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } if (inat_must_vex(insn->attr)) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } end: opcode->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_modrm - collect ModRM byte, if any * @insn: &struct insn containing instruction * * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; int ret; if (modrm->got) return 0; ret = insn_get_opcode(insn); if (ret) return ret; if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); insn_field_set(modrm, mod, 1); if (inat_is_group(insn->attr)) { pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; modrm->got = 1; return 0; err_out: return -ENODATA; } /** * insn_rip_relative() - Does instruction use RIP-relative addressing mode? * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. No effect if @insn->x86_64 is 0. */ int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; int ret; if (!insn->x86_64) return 0; ret = insn_get_modrm(insn); if (ret) return 0; /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5); } /** * insn_get_sib() - Get the SIB byte of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_sib(struct insn *insn) { insn_byte_t modrm; int ret; if (insn->sib.got) return 0; ret = insn_get_modrm(insn); if (ret) return ret; if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { insn_field_set(&insn->sib, get_next(insn_byte_t, insn), 1); } } insn->sib.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_displacement() - Get the displacement of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. * * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; int ret; if (insn->displacement.got) return 0; ret = insn_get_sib(insn); if (ret) return ret; if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: * mod = 00 - no displacement fields (exceptions below) * mod = 01 - 1-byte displacement field * mod = 10 - displacement field is 4 bytes, or 2 bytes if * address size = 2 (0x67 prefix in 32-bit mode) * mod = 11 - no memory operand * * If address size = 2... * mod = 00, r/m = 110 - displacement field is 2 bytes * * If address size != 2... * mod != 11, r/m = 100 - SIB byte exists * mod = 00, SIB base = 101 - displacement field is 4 bytes * mod = 00, r/m = 101 - rip-relative addressing, displacement * field is 4 bytes */ mod = X86_MODRM_MOD(insn->modrm.value); rm = X86_MODRM_RM(insn->modrm.value); base = X86_SIB_BASE(insn->sib.value); if (mod == 3) goto out; if (mod == 1) { insn_field_set(&insn->displacement, get_next(signed char, insn), 1); } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { insn_field_set(&insn->displacement, get_next(short, insn), 2); } } else { if ((mod == 0 && rm == 5) || mod == 2 || (mod == 0 && base == 5)) { insn_field_set(&insn->displacement, get_next(int, insn), 4); } } } out: insn->displacement.got = 1; return 0; err_out: return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: insn_field_set(&insn->moffset1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->moffset1, get_next(int, insn), 4); break; case 8: insn_field_set(&insn->moffset1, get_next(int, insn), 4); insn_field_set(&insn->moffset2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->moffset1.got = insn->moffset2.got = 1; return 1; err_out: return 0; } /* Decode imm v32(Iz). Return 0 if failed */ static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case 4: case 8: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } return 1; err_out: return 0; } /* Decode imm v64(Iv/Ov), Return 0 if failed */ static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn->immediate1.nbytes = 4; break; case 8: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /* Decode ptr16:16/32(Ap) */ static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); break; case 8: /* ptr16:64 is not exist (no segment) */ return 0; default: /* opnd_bytes must be modified manually */ goto err_out; } insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2); insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /** * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be * computed by bit masking with ((1 << (nbytes * 8)) - 1) * * Returns: * 0: on success * < 0: on error */ int insn_get_immediate(struct insn *insn) { int ret; if (insn->immediate.got) return 0; ret = insn_get_displacement(insn); if (ret) return ret; if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) goto err_out; goto done; } if (!inat_has_immediate(insn->attr)) /* no immediates */ goto done; switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: insn_field_set(&insn->immediate, get_next(signed char, insn), 1); break; case INAT_IMM_WORD: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case INAT_IMM_DWORD: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; case INAT_IMM_QWORD: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; case INAT_IMM_PTR: if (!__get_immptr(insn)) goto err_out; break; case INAT_IMM_VWORD32: if (!__get_immv32(insn)) goto err_out; break; case INAT_IMM_VWORD: if (!__get_immv(insn)) goto err_out; break; default: /* Here, insn must have an immediate, but failed */ goto err_out; } if (inat_has_second_immediate(insn->attr)) { insn_field_set(&insn->immediate2, get_next(signed char, insn), 1); } done: insn->immediate.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_length() - Get the length of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * immediates bytes. * * Returns: * - 0 on success * - < 0 on error */ int insn_get_length(struct insn *insn) { int ret; if (insn->length) return 0; ret = insn_get_immediate(insn); if (ret) return ret; insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); return 0; } /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { return insn->opcode.got && insn->modrm.got && insn->sib.got && insn->displacement.got && insn->immediate.got; } /** * insn_decode() - Decode an x86 instruction * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @m: insn mode, see enum insn_mode * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) { int ret; /* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */ if (m == INSN_MODE_KERN) insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); else insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); ret = insn_get_length(insn); if (ret) return ret; if (insn_complete(insn)) return 0; return -EINVAL; }
9 1 1 7 9 9 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue> * * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. */ #include <linux/skbuff.h> #include <net/ipv6.h> #include <net/mld.h> #include <net/addrconf.h> #include <net/ip6_checksum.h> static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) { const struct ipv6hdr *ip6h; unsigned int len; unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); if (!pskb_may_pull(skb, offset)) return -EINVAL; ip6h = ipv6_hdr(skb); if (ip6h->version != 6) return -EINVAL; len = offset + ntohs(ip6h->payload_len); if (skb->len < len || len <= offset) return -EINVAL; skb_set_transport_header(skb, offset); return 0; } static int ipv6_mc_check_exthdrs(struct sk_buff *skb) { const struct ipv6hdr *ip6h; int offset; u8 nexthdr; __be16 frag_off; ip6h = ipv6_hdr(skb); if (ip6h->nexthdr != IPPROTO_HOPOPTS) return -ENOMSG; nexthdr = ip6h->nexthdr; offset = skb_network_offset(skb) + sizeof(*ip6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return -EINVAL; if (nexthdr != IPPROTO_ICMPV6) return -ENOMSG; skb_set_transport_header(skb, offset); return 0; } static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb); len += sizeof(struct mld2_report); return ipv6_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ipv6_mc_check_mld_query(struct sk_buff *skb) { unsigned int transport_len = ipv6_transport_len(skb); struct mld_msg *mld; unsigned int len; /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) return -EINVAL; /* MLDv1? */ if (transport_len != sizeof(struct mld_msg)) { /* or MLDv2? */ if (transport_len < sizeof(struct mld2_query)) return -EINVAL; len = skb_transport_offset(skb) + sizeof(struct mld2_query); if (!ipv6_mc_may_pull(skb, len)) return -EINVAL; } mld = (struct mld_msg *)skb_transport_header(skb); /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer * all-nodes destination address (ff02::1) for general queries */ if (ipv6_addr_any(&mld->mld_mca) && !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) return -EINVAL; return 0; } static int ipv6_mc_check_mld_msg(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); struct mld_msg *mld; if (!ipv6_mc_may_pull(skb, len)) return -ENODATA; mld = (struct mld_msg *)skb_transport_header(skb); switch (mld->mld_type) { case ICMPV6_MGM_REDUCTION: case ICMPV6_MGM_REPORT: return 0; case ICMPV6_MLD2_REPORT: return ipv6_mc_check_mld_reportv2(skb); case ICMPV6_MGM_QUERY: return ipv6_mc_check_mld_query(skb); default: return -ENODATA; } } static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); } static int ipv6_mc_check_icmpv6(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr); unsigned int transport_len = ipv6_transport_len(skb); struct sk_buff *skb_chk; if (!ipv6_mc_may_pull(skb, len)) return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ipv6_mc_validate_checksum); if (!skb_chk) return -EINVAL; if (skb_chk != skb) kfree_skb(skb_chk); return 0; } /** * ipv6_mc_check_mld - checks whether this is a sane MLD packet * @skb: the skb to validate * * Checks whether an IPv6 packet is a valid MLD packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an ICMPv6 packet * with a hop-by-hop option. * -ENODATA: IP+ICMPv6 header with hop-by-hop option validation succeeded * but it is not an MLD packet. * -ENOMEM: A memory allocation failure happened. * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ipv6_mc_check_mld(struct sk_buff *skb) { int ret; ret = ipv6_mc_check_ip6hdr(skb); if (ret < 0) return ret; ret = ipv6_mc_check_exthdrs(skb); if (ret < 0) return ret; ret = ipv6_mc_check_icmpv6(skb); if (ret < 0) return ret; return ipv6_mc_check_mld_msg(skb); } EXPORT_SYMBOL(ipv6_mc_check_mld);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 #ifndef __LINUX_MROUTE_BASE_H #define __LINUX_MROUTE_BASE_H #include <linux/netdevice.h> #include <linux/rhashtable-types.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> /** * struct vif_device - interface representor for multicast routing * @dev: network device being used * @dev_tracker: refcount tracker for @dev reference * @bytes_in: statistic; bytes ingressing * @bytes_out: statistic; bytes egresing * @pkt_in: statistic; packets ingressing * @pkt_out: statistic; packets egressing * @rate_limit: Traffic shaping (NI) * @threshold: TTL threshold * @flags: Control flags * @link: Physical interface index * @dev_parent_id: device parent id * @local: Local address * @remote: Remote address for tunnels */ struct vif_device { struct net_device __rcu *dev; netdevice_tracker dev_tracker; unsigned long bytes_in, bytes_out; unsigned long pkt_in, pkt_out; unsigned long rate_limit; unsigned char threshold; unsigned short flags; int link; /* Currently only used by ipmr */ struct netdev_phys_item_id dev_parent_id; __be32 local, remote; }; struct vif_entry_notifier_info { struct fib_notifier_info info; struct net_device *dev; unsigned short vif_index; unsigned short vif_flags; u32 tb_id; }; static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, unsigned int *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { .family = family, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } #ifndef MAXVIFS /* This one is nasty; value is defined in uapi using different symbols for * mroute and morute6 but both map into same 32. */ #define MAXVIFS 32 #endif /* Note: This helper is deprecated. */ #define VIF_EXISTS(_mrt, _idx) (!!rcu_access_pointer((_mrt)->vif_table[_idx].dev)) /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), MFC_OFFLOAD = BIT(1), }; /** * struct mr_mfc - common multicast routing entries * @mnode: rhashtable list * @mfc_parent: source interface (iif) * @mfc_flags: entry flags * @expires: unresolved entry expire time * @unresolved: unresolved cached skbs * @last_assert: time of last assert * @minvif: minimum VIF id * @maxvif: maximum VIF id * @bytes: bytes that have passed for this entry * @pkt: packets that have passed for this entry * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction * @free: Operation used for freeing an entry under RCU */ struct mr_mfc { struct rhlist_head mnode; unsigned short mfc_parent; int mfc_flags; union { struct { unsigned long expires; struct sk_buff_head unresolved; } unres; struct { unsigned long last_assert; int minvif; int maxvif; unsigned long bytes; unsigned long pkt; unsigned long wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; } res; } mfc_un; struct list_head list; struct rcu_head rcu; void (*free)(struct rcu_head *head); }; static inline void mr_cache_put(struct mr_mfc *c) { if (refcount_dec_and_test(&c->mfc_un.res.refcount)) call_rcu(&c->rcu, c->free); } static inline void mr_cache_hold(struct mr_mfc *c) { refcount_inc(&c->mfc_un.res.refcount); } struct mfc_entry_notifier_info { struct fib_notifier_info info; struct mr_mfc *mfc; u32 tb_id; }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .mfc = mfc, .tb_id = tb_id }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, unsigned int *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { .family = family, }, .mfc = mfc, .tb_id = tb_id }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } struct mr_table; /** * struct mr_table_ops - callbacks and info for protocol-specific ops * @rht_params: parameters for accessing the MFC hash * @cmparg_any: a hash key to be used for matching on (*,*) routes */ struct mr_table_ops { const struct rhashtable_params *rht_params; void *cmparg_any; }; /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations * @id: identifier of the table * @mroute_sk: socket associated with the table * @ipmr_expire_timer: timer for handling unresolved routes * @mfc_unres_queue: list of unresolved MFC entries * @vif_table: array containing all possible vifs * @mfc_hash: Hash table of all resolved routes for easy lookup * @mfc_cache_list: list of resovled routes for possible traversal * @maxvif: Identifier of highest value vif currently in use * @cache_resolve_queue_len: current size of unresolved queue * @mroute_do_assert: Whether to inform userspace on wrong ingress * @mroute_do_pim: Whether to receive IGMP PIMv1 * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { struct list_head list; possible_net_t net; struct mr_table_ops ops; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct vif_device vif_table[MAXVIFS]; struct rhltable mfc_hash; struct list_head mfc_cache_list; int maxvif; atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)); /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent); void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm); int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, struct netlink_callback *cb, int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask) { } static inline void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) { return NULL; } static inline void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) { return NULL; } static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) { return NULL; } static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { return -EINVAL; } static inline int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { return -EINVAL; } static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack) { return -EINVAL; } #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) { return mr_mfc_find_parent(mrt, hasharg, -1); } #ifdef CONFIG_PROC_FS struct mr_vif_iter { struct seq_net_private p; struct mr_table *mrt; int ct; }; struct mr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; struct list_head *cache; /* Lock protecting the mr_table's unresolved queue */ spinlock_t *lock; }; #ifdef CONFIG_IP_MROUTE_COMMON void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos); void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return *pos ? mr_vif_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos); void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { struct mr_mfc_iter *it = seq->private; it->mrt = mrt; it->cache = NULL; it->lock = lock; return *pos ? mr_mfc_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(it->lock); else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } #else static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) { return NULL; } static inline void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { return NULL; } static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { return NULL; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { } #endif #endif #endif
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/mount.c - kernfs mount implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/seq_file.h> #include <linux/exportfs.h> #include <linux/uuid.h> #include <linux/statfs.h> #include "kernfs-internal.h" struct kmem_cache *kernfs_node_cache __ro_after_init; struct kmem_cache *kernfs_iattrs_cache __ro_after_init; struct kernfs_global_locks *kernfs_locks __ro_after_init; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) return scops->show_options(sf, root); return 0; } static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_path) return scops->show_path(sf, node, root); seq_dentry(sf, dentry, " \t\n\\"); return 0; } static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) { simple_statfs(dentry, buf); buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, }; static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct kernfs_node *kn = inode->i_private; if (*max_len < 2) { *max_len = 2; return FILEID_INVALID; } *max_len = 2; *(u64 *)fh = kn->id; return FILEID_KERNFS; } static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, bool get_parent) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *kn; struct inode *inode; u64 id; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_KERNFS: id = *(u64 *)fid; break; case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: /* * blk_log_action() exposes "LOW32,HIGH32" pair without * type and userland can call us with generic fid * constructed from them. Combine it back to ID. See * blk_log_action(). */ id = ((u64)fid->i32.gen << 32) | fid->i32.ino; break; default: return NULL; } kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); if (get_parent) { struct kernfs_node *parent; parent = kernfs_get_parent(kn); kernfs_put(kn); kn = parent; if (!kn) return ERR_PTR(-ESTALE); } inode = kernfs_get_inode(sb, kn); kernfs_put(kn); return d_obtain_alias(inode); } static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) { struct kernfs_node *kn = kernfs_dentry_node(child); return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); } static const struct export_operations kernfs_export_ops = { .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, }; /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question * * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one, * %NULL is returned. */ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { if (sb->s_op == &kernfs_sops) return kernfs_info(sb)->root; return NULL; } /* * find the next ancestor in the path down to @child, where @parent was the * ancestor whose descendant we want to find. * * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root * node. If @parent is b, then we return the node for c. * Passing in d as @parent is not ok. */ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, struct kernfs_node *parent) { if (child == parent) { pr_crit_once("BUG in find_next_ancestor: called with parent == child"); return NULL; } while (child->parent != parent) { if (!child->parent) return NULL; child = child->parent; } return child; } /** * kernfs_node_dentry - get a dentry for the given kernfs_node * @kn: kernfs_node for which a dentry is needed * @sb: the kernfs super_block * * Return: the dentry pointer */ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb) { struct dentry *dentry; struct kernfs_node *knparent = NULL; BUG_ON(sb->s_op != &kernfs_sops); dentry = dget(sb->s_root); /* Check if this is the root kernfs_node */ if (!kn->parent) return dentry; knparent = find_next_ancestor(kn, NULL); if (WARN_ON(!knparent)) { dput(dentry); return ERR_PTR(-EINVAL); } do { struct dentry *dtmp; struct kernfs_node *kntmp; if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); if (WARN_ON(!kntmp)) { dput(dentry); return ERR_PTR(-EINVAL); } dtmp = lookup_positive_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) return dtmp; knparent = kntmp; dentry = dtmp; } while (true); } static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root; info->sb = sb; /* Userspace would break if executables or devices appear on sysfs */ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* sysfs dentries and inodes don't require IO to create */ sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; } /* instantiate and link root dentry */ root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } sb->s_root = root; sb->s_d_op = &kernfs_dops; return 0; } static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; kfc->ns_tag = NULL; return set_anon_super_fc(sb, fc); } /** * kernfs_super_ns - determine the namespace tag of a kernfs super_block * @sb: super_block of interest * * Return: the namespace tag associated with kernfs super_block @sb. */ const void *kernfs_super_ns(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); return info->ns; } /** * kernfs_get_tree - kernfs filesystem access/retrieval helper * @fc: The filesystem context. * * This is to be called from each kernfs user's fs_context->ops->get_tree() * implementation, which should set the specified ->@fs_type and ->@flags, and * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, * respectively. * * Return: %0 on success, -errno on failure. */ int kernfs_get_tree(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; info->root = kfc->root; info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); fc->s_fs_info = info; sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = kfc->root; kfc->new_sb_created = true; error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); return error; } sb->s_flags |= SB_ACTIVE; uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); up_write(&root->kernfs_supers_rwsem); } fc->root = dget(sb->s_root); return 0; } void kernfs_free_fs_context(struct fs_context *fc) { /* Note that we don't deal with kfc->ns_tag here. */ kfree(fc->s_fs_info); fc->s_fs_info = NULL; } /** * kernfs_kill_sb - kill_sb for kernfs * @sb: super_block being killed * * This can be used directly for file_system_type->kill_sb(). If a kernfs * user needs extra cleanup, it can implement its own kill_sb() and call * this function at the end. */ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; down_write(&root->kernfs_supers_rwsem); list_del(&info->node); up_write(&root->kernfs_supers_rwsem); /* * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing kernfs_super_info. */ kill_anon_super(sb); kfree(info); } static void __init kernfs_mutex_init(void) { int count; for (count = 0; count < NR_KERNFS_LOCKS; count++) mutex_init(&kernfs_locks->open_file_mutex[count]); } static void __init kernfs_lock_init(void) { kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL); WARN_ON(!kernfs_locks); kernfs_mutex_init(); } void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", sizeof(struct kernfs_iattrs), 0, SLAB_PANIC, NULL); kernfs_lock_init(); }
11 11 10 17 16 2 3 1 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 2 12 12 12 12 11 11 11 1 15 7 2 7 7 7 7 3 21 21 7 7 7 2 7 6 7 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC individual remote procedure call handling * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/module.h> #include <linux/circ_buf.h> #include <linux/spinlock_types.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_UNINITIALISED] = "Uninit ", [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", [RXRPC_CALL_COMPLETE] = "Complete", }; const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_SUCCEEDED] = "Complete", [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", [RXRPC_CALL_LOCAL_ERROR] = "LocError", [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; struct kmem_cache *rxrpc_call_jar; static DEFINE_SEMAPHORE(rxrpc_call_limiter, 1000); static DEFINE_SEMAPHORE(rxrpc_kernel_call_limiter, 1000); void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) { struct rxrpc_local *local = call->local; bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { spin_lock_bh(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) busy = true; if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } spin_unlock_bh(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } } static void rxrpc_call_timer_expired(struct timer_list *t) { struct rxrpc_call *call = from_timer(call, t, timer); _enter("%d", call->debug_id); if (!__rxrpc_call_is_complete(call)) { trace_rxrpc_timer_expired(call); rxrpc_poke_call(call, rxrpc_call_poke_timer); } } static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; static void rxrpc_destroy_call(struct work_struct *); /* * find an extant server call * - called in process context with IRQs enabled */ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, unsigned long user_call_ID) { struct rxrpc_call *call; struct rb_node *p; _enter("%p,%lx", rx, user_call_ID); read_lock(&rx->call_lock); p = rx->calls.rb_node; while (p) { call = rb_entry(p, struct rxrpc_call, sock_node); if (user_call_ID < call->user_call_ID) p = p->rb_left; else if (user_call_ID > call->user_call_ID) p = p->rb_right; else goto found_extant_call; } read_unlock(&rx->call_lock); _leave(" = NULL"); return NULL; found_extant_call: rxrpc_get_call(call, rxrpc_call_get_sendmsg); read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, refcount_read(&call->ref)); return call; } /* * allocate a new call */ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); call = kmem_cache_zalloc(rxrpc_call_jar, gfp); if (!call) return NULL; mutex_init(&call->user_mutex); /* Prevent lockdep reporting a deadlock false positive between the afs * filesystem and sys_sendmsg() via the mmap sem. */ if (rx->sk.sk_kern_sock) lockdep_set_class(&call->user_mutex, &rxrpc_call_user_mutex_lock_class_key); timer_setup(&call->timer, rxrpc_call_timer_expired, 0); INIT_WORK(&call->destroyer, rxrpc_destroy_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->wait_link); INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); INIT_LIST_HEAD(&call->tx_sendmsg); INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; call->ackr_window = 1; call->ackr_wtop = 1; call->delay_ack_at = KTIME_MAX; call->ack_lost_at = KTIME_MAX; call->resend_at = KTIME_MAX; call->ping_at = KTIME_MAX; call->keepalive_at = KTIME_MAX; call->expect_rx_by = KTIME_MAX; call->expect_req_by = KTIME_MAX; call->expect_term_by = KTIME_MAX; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; if (RXRPC_TX_SMSS > 2190) call->cong_cwnd = 2; else if (RXRPC_TX_SMSS > 1095) call->cong_cwnd = 3; else call->cong_cwnd = 4; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); return call; } /* * Allocate a new client call. */ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; ktime_t now; int ret; _enter(""); call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return ERR_PTR(-ENOMEM); now = ktime_get_real(); call->acks_latest_ts = now; call->cong_tstamp = now; call->dest_srx = cp->peer->srx; call->dest_srx.srx_service = cp->service_id; call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; call->key = key_get(cp->key); call->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_call); call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); call->security_level = cp->security_level; if (p->kernel) __set_bit(RXRPC_CALL_KERNEL, &call->flags); if (cp->upgrade) __set_bit(RXRPC_CALL_UPGRADE, &call->flags); if (cp->exclusive) __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) call->next_rx_timo = min(p->timeouts.normal, 1); if (p->timeouts.idle) call->next_req_timo = min(p->timeouts.idle, 1); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard; ret = rxrpc_init_client_call_security(call); if (ret < 0) { rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret); rxrpc_put_call(call, rxrpc_call_put_discard_error); return ERR_PTR(ret); } rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_CONN); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), p->user_call_ID, rxrpc_call_new_client); _leave(" = %p", call); return call; } /* * Initiate the call ack/resend/expiry timer. */ void rxrpc_start_call_timer(struct rxrpc_call *call) { if (call->hard_timo) { ktime_t delay = ms_to_ktime(call->hard_timo * 1000); call->expect_term_by = ktime_add(ktime_get_real(), delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); } call->timer.expires = jiffies; } /* * Wait for a call slot to become available. */ static struct semaphore *rxrpc_get_call_slot(struct rxrpc_call_params *p, gfp_t gfp) { struct semaphore *limiter = &rxrpc_call_limiter; if (p->kernel) limiter = &rxrpc_kernel_call_limiter; if (p->interruptibility == RXRPC_UNINTERRUPTIBLE) { down(limiter); return limiter; } return down_interruptible(limiter) < 0 ? NULL : limiter; } /* * Release a call slot. */ static void rxrpc_put_call_slot(struct rxrpc_call *call) { struct semaphore *limiter = &rxrpc_call_limiter; if (test_bit(RXRPC_CALL_KERNEL, &call->flags)) limiter = &rxrpc_kernel_call_limiter; up(limiter); } /* * Start the process of connecting a call. We obtain a peer and a connection * bundle, but the actual association of a call with a connection is offloaded * to the I/O thread to simplify locking. */ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_local *local = call->local; int ret = -ENOMEM; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); ret = rxrpc_look_up_bundle(call, gfp); if (ret < 0) goto error; trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); spin_lock(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); spin_unlock(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; error: __set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); return ret; } /* * Set up a call for the given parameters. * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. */ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) __releases(&rx->sk.sk_lock.slock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet; struct semaphore *limiter; struct rb_node *parent, **pp; int ret; _enter("%p,%lx", rx, p->user_call_ID); if (WARN_ON_ONCE(!cp->peer)) { release_sock(&rx->sk); return ERR_PTR(-EIO); } limiter = rxrpc_get_call_slot(p, gfp); if (!limiter) { release_sock(&rx->sk); return ERR_PTR(-ERESTARTSYS); } call = rxrpc_alloc_client_call(rx, cp, p, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); up(limiter); _leave(" = %ld", PTR_ERR(call)); return call; } /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. */ mutex_lock(&call->user_mutex); /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); pp = &rx->calls.rb_node; parent = NULL; while (*pp) { parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); if (p->user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; else if (p->user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); call->user_call_ID = p->user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_get_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); list_add(&call->sock_link, &rx->sock_calls); write_unlock(&rx->call_lock); rxnet = call->rxnet; spin_lock(&rxnet->call_lock); list_add_tail_rcu(&call->link, &rxnet->calls); spin_unlock(&rxnet->call_lock); /* From this point on, the call is protected by its own lock. */ release_sock(&rx->sk); /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ ret = rxrpc_connect_call(call, gfp); if (ret < 0) goto error_attached_to_socket; _leave(" = %p [new]", call); return call; /* We unexpectedly found the user ID in the list after taking * the call_lock. This shouldn't happen unless the user races * with itself and tries to add the same user ID twice at the * same time in different threads. */ error_dup_user_ID: write_unlock(&rx->call_lock); release_sock(&rx->sk); rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EEXIST); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, rxrpc_call_see_userid_exists); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_userid_exists); _leave(" = -EEXIST"); return ERR_PTR(-EEXIST); /* We got an error, but the call is attached to the socket and is in * need of release. However, we might now race with recvmsg() when it * completion notifies the socket. Return 0 from sys_sendmsg() and * leave the error to recvmsg() to deal with. */ error_attached_to_socket: trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret, rxrpc_call_see_connect_failed); rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); _leave(" = c=%08x [err]", call->debug_id); return call; } /* * Set up an incoming call. call->conn points to the connection. * This is called in BH context and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_connection *conn = call->conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); u32 chan; _enter(",%d", call->conn->debug_id); rcu_assign_pointer(call->socket, rx); call->call_id = sp->hdr.callNumber; call->dest_srx.srx_service = sp->hdr.serviceId; call->cid = sp->hdr.cid; call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); break; case RXRPC_CONN_SERVICE: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); break; default: BUG(); } rxrpc_get_call(call, rxrpc_call_get_io_thread); /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called * from) and the RESPONSE packet parser (which is only really * interested in call_counter and can cope with a disagreement with the * call pointer). */ chan = sp->hdr.cid & RXRPC_CHANNELMASK; conn->channels[chan].call_counter = call->call_id; conn->channels[chan].call_id = call->call_id; conn->channels[chan].call = call; spin_unlock(&conn->state_lock); spin_lock(&conn->peer->lock); hlist_add_head(&call->error_link, &conn->peer->error_targets); spin_unlock(&conn->peer->lock); rxrpc_start_call_timer(call); _leave(""); } /* * Note the re-emergence of a call. */ void rxrpc_see_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { if (call) { int r = refcount_read(&call->ref); trace_rxrpc_call(call->debug_id, r, 0, why); } } struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { int r; if (!call || !__refcount_inc_not_zero(&call->ref, &r)) return NULL; trace_rxrpc_call(call->debug_id, r + 1, 0, why); return call; } /* * Note the addition of a ref on a call. */ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { int r; __refcount_inc(&call->ref, &r); trace_rxrpc_call(call->debug_id, r + 1, 0, why); } /* * Clean up the Rx skb ring. */ static void rxrpc_cleanup_ring(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); rxrpc_purge_queue(&call->rx_oos_queue); } /* * Detach a call from its owning socket. */ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; bool put = false, putu = false; _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref)); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), call->flags, rxrpc_call_see_release); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ spin_lock(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", call, call->events, call->flags); list_del(&call->recvmsg_link); put = true; } /* list_empty() must return false in rxrpc_notify_socket() */ call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; spin_unlock(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); write_lock(&rx->call_lock); if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); putu = true; } list_del(&call->sock_link); write_unlock(&rx->call_lock); _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); if (putu) rxrpc_put_call(call, rxrpc_call_put_userid); _leave(""); } /* * release all the calls associated with a socket */ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) { struct rxrpc_call *call; _enter("%p", rx); while (!list_empty(&rx->to_be_accepted)) { call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, rxrpc_abort_call_sock_release_tba); rxrpc_put_call(call, rxrpc_call_put_release_sock_tba); } while (!list_empty(&rx->sock_calls)) { call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_get_release_sock); rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, rxrpc_abort_call_sock_release); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_release_sock); } _leave(""); } /* * release a call */ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { struct rxrpc_net *rxnet = call->rxnet; unsigned int debug_id = call->debug_id; bool dead; int r; ASSERT(call != NULL); dead = __refcount_dec_and_test(&call->ref, &r); trace_rxrpc_call(debug_id, r - 1, 0, why); if (dead) { ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { spin_lock(&rxnet->call_lock); list_del_init(&call->link); spin_unlock(&rxnet->call_lock); } rxrpc_cleanup_call(call); } } /* * Free up the call under RCU. */ static void rxrpc_rcu_free_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); struct rxrpc_net *rxnet = READ_ONCE(call->rxnet); kmem_cache_free(rxrpc_call_jar, call); if (atomic_dec_and_test(&rxnet->nr_calls)) wake_up_var(&rxnet->nr_calls); } /* * Final call destruction - but must be done in process context. */ static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); struct rxrpc_txbuf *txb; del_timer_sync(&call->timer); rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); rxrpc_cleanup_ring(call); while ((txb = list_first_entry_or_null(&call->tx_sendmsg, struct rxrpc_txbuf, call_link))) { list_del(&txb->call_link); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } while ((txb = list_first_entry_or_null(&call->tx_buffer, struct rxrpc_txbuf, call_link))) { list_del(&txb->call_link); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); rxrpc_put_bundle(call->bundle, rxrpc_bundle_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); rxrpc_put_local(call->local, rxrpc_local_put_call); call_rcu(&call->rcu, rxrpc_rcu_free_call); } /* * clean up a call */ void rxrpc_cleanup_call(struct rxrpc_call *call) { memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); del_timer(&call->timer); if (rcu_read_lock_held()) /* Can't use the rxrpc workqueue as we need to cancel/flush * something that may be running/waiting there. */ schedule_work(&call->destroyer); else rxrpc_destroy_call(&call->destroyer); } /* * Make sure that all calls are gone from a network namespace. To reach this * point, any open UDP sockets in that namespace must have been closed, so any * outstanding calls cannot be doing I/O. */ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) { struct rxrpc_call *call; _enter(""); if (!list_empty(&rxnet->calls)) { spin_lock(&rxnet->call_lock); while (!list_empty(&rxnet->calls)) { call = list_entry(rxnet->calls.next, struct rxrpc_call, link); _debug("Zapping call %p", call); rxrpc_see_call(call, rxrpc_call_see_zap); list_del_init(&call->link); pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, refcount_read(&call->ref), rxrpc_call_states[__rxrpc_call_state(call)], call->flags, call->events); spin_unlock(&rxnet->call_lock); cond_resched(); spin_lock(&rxnet->call_lock); } spin_unlock(&rxnet->call_lock); } atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/ipv4.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/icmp.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/inet_sock.h> #include <net/protocol.h> #include <net/sock.h> #include <net/timewait_sock.h> #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/netns/generic.h> #include "ackvec.h" #include "ccid.h" #include "dccp.h" #include "feat.h" struct dccp_v4_pernet { struct sock *v4_ctl_sk; }; static unsigned int dccp_v4_pernet_id __read_mostly; /* * The per-net v4_ctl_sk socket is used for responding to * the Out-of-the-blue (OOTB) packets. A control sock will be created * for this socket at the initialization time. */ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); __be16 orig_sport, orig_dport; __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; struct ip_options_rcu *inet_opt; dp->dccps_role = DCCP_ROLE_CLIENT; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_opt != NULL && inet_opt->opt.srr) { if (daddr == 0) return -EINVAL; nexthop = inet_opt->opt.faddr; } orig_sport = inet->inet_sport; orig_dport = usin->sin_port; fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, sk->sk_bound_dev_if, IPPROTO_DCCP, orig_sport, orig_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (inet_opt == NULL || !inet_opt->opt.srr) daddr = fl4->daddr; if (inet->inet_saddr == 0) { err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { ip_rt_put(rt); return err; } } else { sk_rcv_saddr_set(sk, inet->inet_saddr); } inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; /* * Socket identity is still unknown (sport may be zero). * However we set state to DCCP_REQUESTING and not releasing socket * lock select source port, enter ourselves into the hash tables and * complete initialization after this. */ dccp_set_state(sk, DCCP_REQUESTING); err = inet_hash_connect(&dccp_death_row, sk); if (err != 0) goto failure; rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto failure; } /* OK, now commit destination to socket. */ sk_setup_caps(sk, &rt->dst); dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, inet->inet_dport); atomic_set(&inet->inet_id, get_random_u16()); err = dccp_connect(sk); rt = NULL; if (err != 0) goto failure; out: return err; failure: /* * This unhashes the socket and releases the local port, if necessary. */ dccp_set_state(sk, DCCP_CLOSED); inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; goto out; } EXPORT_SYMBOL_GPL(dccp_v4_connect); /* * This routine does path mtu discovery as defined in RFC1191. */ static inline void dccp_do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) { struct dst_entry *dst; const struct inet_sock *inet = inet_sk(sk); const struct dccp_sock *dp = dccp_sk(sk); /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs * send out by Linux are always < 576bytes so they should go through * unfragmented). */ if (sk->sk_state == DCCP_LISTEN) return; dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && ip_sk_accept_pmtu(sk) && inet_csk(sk)->icsk_pmtu_cookie > mtu) { dccp_sync_mss(sk, mtu); /* * From RFC 4340, sec. 14.1: * * DCCP-Sync packets are the best choice for upward * probing, since DCCP-Sync probes do not risk application * data loss. */ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); } /* else let the usual retransmit timer handle it */ } static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) { struct dst_entry *dst = __sk_dst_check(sk, 0); if (dst) dst->ops->redirect(dst, sk, skb); } void dccp_req_err(struct sock *sk, u64 seq) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); /* * ICMPs are not backlogged, hence we cannot get an established * socket here. */ if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); } else { /* * Still in RESPOND, just remove it silently. * There is no good way to pass the error to the newly * created socket, and POSIX does not want network * errors returned from accept(). */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); } reqsk_put(req); } EXPORT_SYMBOL(dccp_req_err); /* * This routine is called by the ICMP module when it gets some sort of error * condition. If err < 0 then the socket should be closed and the error * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. * After adjustment header points to the first 8 bytes of the tcp header. We * need to find the appropriate port. * * The locking strategy used here is very "optimistic". When someone else * accesses the socket the ICMP is just dropped and for some paths there is no * check at all. A more general error queue to queue errors for later handling * is probably better. */ static int dccp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (struct iphdr *)skb->data; const u8 offset = iph->ihl << 2; const struct dccp_hdr *dh; struct dccp_sock *dp; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; __u64 seq; int err; struct net *net = dev_net(skb->dev); if (!pskb_may_pull(skb, offset + sizeof(*dh))) return -EINVAL; dh = (struct dccp_hdr *)(skb->data + offset); if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) return -EINVAL; iph = (struct iphdr *)skb->data; dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet_lookup_established(net, &dccp_hashinfo, iph->daddr, dh->dccph_dport, iph->saddr, ntohs(dh->dccph_sport), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); return 0; } seq = dccp_hdr_seq(dh); if (sk->sk_state == DCCP_NEW_SYN_RECV) { dccp_req_err(sk, seq); return 0; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == DCCP_CLOSED) goto out; dp = dccp_sk(sk); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) dccp_do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ if (!sock_owned_by_user(sk)) dccp_do_pmtu_discovery(sk, iph, info); goto out; } err = icmp_err_convert[code].errno; break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; default: goto out; } switch (sk->sk_state) { case DCCP_REQUESTING: case DCCP_RESPOND: if (!sock_owned_by_user(sk)) { __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; sk_error_report(sk); dccp_done(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } goto out; } /* If we've already connected we will keep trying * until we time out, or the user gives up. * * rfc1122 4.2.3.9 allows to consider as hard errors * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, * but it is obsoleted by pmtu discovery). * * Note, that in modern internet, where routing is unreliable * and in each dark corner broken firewalls sit, sending random * errors ordered by their masters even this two messages finally lose * their original sense (even Linux sends invalid PORT_UNREACHs) * * Now we are in compliance with RFCs. * --ANK (980905) */ if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, __be32 src, __be32 dst) { return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum); } void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb) { const struct inet_sock *inet = inet_sk(sk); struct dccp_hdr *dh = dccp_hdr(skb); dccp_csum_outgoing(skb); dh->dccph_checksum = dccp_v4_csum_finish(skb, inet->inet_saddr, inet->inet_daddr); } EXPORT_SYMBOL_GPL(dccp_v4_send_check); static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) { return secure_dccp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, dccp_hdr(skb)->dccph_dport, dccp_hdr(skb)->dccph_sport); } /* * The three way handshake has completed - we got a valid ACK or DATAACK - * now create the new socket. * * This is the equivalent of TCP's tcp_v4_syn_recv_sock */ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq; struct inet_sock *newinet; struct sock *newsk; if (sk_acceptq_is_full(sk)) goto exit_overflow; newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) goto exit_nonewsk; newinet = inet_sk(newsk); ireq = inet_rsk(req); sk_daddr_set(newsk, ireq->ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; atomic_set(&newinet->inet_id, get_random_u16()); if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) goto put_and_exit; sk_setup_caps(newsk, dst); dccp_sync_mss(newsk, dst_mtu(dst)); if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); if (*own_req) ireq->ireq_opt = NULL; else newinet->inet_opt = NULL; return newsk; exit_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); dccp_done(newsk); goto exit; } EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; const struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, .flowi4_tos = ip_sock_rt_tos(sk), .flowi4_scope = ip_sock_rt_scope(sk), .flowi4_proto = sk->sk_protocol, .fl4_sport = dccp_hdr(skb)->dccph_dport, .fl4_dport = dccp_hdr(skb)->dccph_sport, }; security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } return &rt->dst; } static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) { int err = -1; struct sk_buff *skb; struct dst_entry *dst; struct flowi4 fl4; dst = inet_csk_route_req(sk, &fl4, req); if (dst == NULL) goto out; skb = dccp_make_response(sk, dst, req); if (skb != NULL) { const struct inet_request_sock *ireq = inet_rsk(req); struct dccp_hdr *dh = dccp_hdr(skb); dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, rcu_dereference(ireq->ireq_opt), READ_ONCE(inet_sk(sk)->tos)); rcu_read_unlock(); err = net_xmit_eval(err); } out: dst_release(dst); return err; } static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) { int err; const struct iphdr *rxiph; struct sk_buff *skb; struct dst_entry *dst; struct net *net = dev_net(skb_dst(rxskb)->dev); struct dccp_v4_pernet *pn; struct sock *ctl_sk; /* Never send a reset in response to a reset. */ if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) return; if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) return; pn = net_generic(net, dccp_v4_pernet_id); ctl_sk = pn->v4_ctl_sk; dst = dccp_v4_route_skb(net, ctl_sk, rxskb); if (dst == NULL) return; skb = dccp_ctl_make_reset(ctl_sk, rxskb); if (skb == NULL) goto out; rxiph = ip_hdr(rxskb); dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, rxiph->daddr); skb_dst_set(skb, dst_clone(dst)); local_bh_disable(); bh_lock_sock(ctl_sk); err = ip_build_and_send_pkt(skb, ctl_sk, rxiph->daddr, rxiph->saddr, NULL, inet_sk(ctl_sk)->tos); bh_unlock_sock(ctl_sk); if (net_xmit_eval(err) == 0) { __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); } local_bh_enable(); out: dst_release(dst); } static void dccp_v4_reqsk_destructor(struct request_sock *req) { dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } void dccp_syn_ack_timeout(const struct request_sock *req) { } EXPORT_SYMBOL(dccp_syn_ack_timeout); static struct request_sock_ops dccp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct dccp_request_sock), .rtx_syn_ack = dccp_v4_send_response, .send_ack = dccp_reqsk_send_ack, .destructor = dccp_v4_reqsk_destructor, .send_reset = dccp_v4_ctl_send_reset, .syn_ack_timeout = dccp_syn_ack_timeout, }; int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; struct request_sock *req; struct dccp_request_sock *dreq; const __be32 service = dccp_hdr_request(skb)->dccph_req_service; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return 0; /* discard, don't send a reset here */ if (dccp_bad_service_code(sk, service)) { dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; goto drop; } /* * TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; if (inet_csk_reqsk_queue_is_full(sk)) goto drop; if (sk_acceptq_is_full(sk)) goto drop; req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); if (req == NULL) goto drop; if (dccp_reqsk_init(req, dccp_sk(sk), skb)) goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) goto drop_and_free; ireq = inet_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ireq->ir_mark = inet_request_mark(sk, skb); ireq->ireq_family = AF_INET; ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; /* * Step 3: Process LISTEN state * * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie * * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). */ dreq->dreq_isr = dcb->dccpd_seq; dreq->dreq_gsr = dreq->dreq_isr; dreq->dreq_iss = dccp_v4_init_sequence(skb); dreq->dreq_gss = dreq->dreq_iss; dreq->dreq_service = service; if (dccp_v4_send_response(sk, req)) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); reqsk_put(req); return 0; drop_and_free: reqsk_free(req); drop: __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); return -1; } EXPORT_SYMBOL_GPL(dccp_v4_conn_request); int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct dccp_hdr *dh = dccp_hdr(skb); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ if (dccp_rcv_established(sk, skb, dh, skb->len)) goto reset; return 0; } /* * Step 3: Process LISTEN state * If P.type == Request or P contains a valid Init Cookie option, * (* Must scan the packet's options to check for Init * Cookies. Only Init Cookies are processed here, * however; other options are processed in Step 8. This * scan need only be performed if the endpoint uses Init * Cookies *) * (* Generate a new socket and switch to that socket *) * Set S := new socket for this port pair * S.state = RESPOND * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies * Continue with S.state == RESPOND * (* A Response packet will be generated in Step 11 *) * Otherwise, * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return * * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ if (dccp_rcv_state_process(sk, skb, dh, skb->len)) goto reset; return 0; reset: dccp_v4_ctl_send_reset(sk, skb); kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); /** * dccp_invalid_packet - check for malformed packets * @skb: Packet to validate * * Implements RFC 4340, 8.5: Step 1: Check header basics * Packets that fail these checks are ignored and do not receive Resets. */ int dccp_invalid_packet(struct sk_buff *skb) { const struct dccp_hdr *dh; unsigned int cscov; u8 dccph_doff; if (skb->pkt_type != PACKET_HOST) return 1; /* If the packet is shorter than 12 bytes, drop packet and return */ if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { DCCP_WARN("pskb_may_pull failed\n"); return 1; } dh = dccp_hdr(skb); /* If P.type is not understood, drop packet and return */ if (dh->dccph_type >= DCCP_PKT_INVALID) { DCCP_WARN("invalid packet type\n"); return 1; } /* * If P.Data Offset is too small for packet type, drop packet and return */ dccph_doff = dh->dccph_doff; if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff); return 1; } /* * If P.Data Offset is too large for packet, drop packet and return */ if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); return 1; } dh = dccp_hdr(skb); /* * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet * has short sequence numbers), drop packet and return */ if ((dh->dccph_type < DCCP_PKT_DATA || dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) { DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n", dccp_packet_name(dh->dccph_type)); return 1; } /* * If P.CsCov is too large for the packet size, drop packet and return. * This must come _before_ checksumming (not as RFC 4340 suggests). */ cscov = dccp_csum_coverage(skb); if (cscov > skb->len) { DCCP_WARN("P.CsCov %u exceeds packet length %d\n", dh->dccph_cscov, skb->len); return 1; } /* If header checksum is incorrect, drop packet and return. * (This step is completed in the AF-dependent functions.) */ skb->csum = skb_checksum(skb, 0, cscov, 0); return 0; } EXPORT_SYMBOL_GPL(dccp_invalid_packet); /* this is called when real data arrives */ static int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; const struct iphdr *iph; bool refcounted; struct sock *sk; int min_cov; /* Step 1: Check header basics */ if (dccp_invalid_packet(skb)) goto discard_it; iph = ip_hdr(skb); /* Step 1: If header checksum is incorrect, drop packet and return */ if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) { DCCP_WARN("dropped packet with invalid checksum\n"); goto discard_it; } dh = dccp_hdr(skb); DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu", dccp_packet_name(dh->dccph_type), &iph->saddr, ntohs(dh->dccph_sport), &iph->daddr, ntohs(dh->dccph_dport), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); if (dccp_packet_without_ack(skb)) { DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; dccp_pr_debug_cat("\n"); } else { DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); } lookup: sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, 0, &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; } /* * Step 2: * ... or S.state == TIMEWAIT, * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return */ if (sk->sk_state == DCCP_TIME_WAIT) { dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); inet_twsk_put(inet_twsk(sk)); goto no_dccp_socket; } if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); struct sock *nsk; sk = req->rsk_listener; if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sock_hold(sk); refcounted = true; nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); goto discard_and_relse; } if (nsk == sk) { reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v4_ctl_send_reset(sk, skb); goto discard_and_relse; } else { sock_put(sk); return 0; } } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov */ min_cov = dccp_sk(sk)->dccps_pcrlen; if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", dh->dccph_cscov, min_cov); /* FIXME: "Such packets SHOULD be reported using Data Dropped * options (Section 11.7) with Drop Code 0, Protocol * Constraints." */ goto discard_and_relse; } if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; nf_reset_ct(skb); return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; /* * Step 2: * If no socket ... * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return */ if (dh->dccph_type != DCCP_PKT_RESET) { DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; dccp_v4_ctl_send_reset(sk, skb); } discard_it: kfree_skb(skb); return 0; discard_and_relse: if (refcounted) sock_put(sk); goto discard_it; } static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { .queue_xmit = ip_queue_xmit, .send_check = dccp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .conn_request = dccp_v4_conn_request, .syn_recv_sock = dccp_v4_request_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), }; static int dccp_v4_init_sock(struct sock *sk) { static __u8 dccp_v4_ctl_sock_initialized; int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); if (err == 0) { if (unlikely(!dccp_v4_ctl_sock_initialized)) dccp_v4_ctl_sock_initialized = 1; inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; } return err; } static struct timewait_sock_ops dccp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct inet_timewait_sock), }; static struct proto dccp_v4_prot = { .name = "DCCP", .owner = THIS_MODULE, .close = dccp_close, .connect = dccp_v4_connect, .disconnect = dccp_disconnect, .ioctl = dccp_ioctl, .init = dccp_v4_init_sock, .setsockopt = dccp_setsockopt, .getsockopt = dccp_getsockopt, .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v4_do_rcv, .hash = inet_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, .shutdown = dccp_shutdown, .destroy = dccp_destroy_sock, .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, }; static const struct net_protocol dccp_v4_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; static const struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ .poll = dccp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static struct inet_protosw dccp_v4_protosw = { .type = SOCK_DCCP, .protocol = IPPROTO_DCCP, .prot = &dccp_v4_prot, .ops = &inet_dccp_ops, .flags = INET_PROTOSW_ICSK, }; static int __net_init dccp_v4_init_net(struct net *net) { struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); if (dccp_hashinfo.bhash == NULL) return -ESOCKTNOSUPPORT; return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET, SOCK_DCCP, IPPROTO_DCCP, net); } static void __net_exit dccp_v4_exit_net(struct net *net) { struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); inet_ctl_sock_destroy(pn->v4_ctl_sk); } static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) { inet_twsk_purge(&dccp_hashinfo, AF_INET); } static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, .exit_batch = dccp_v4_exit_batch, .id = &dccp_v4_pernet_id, .size = sizeof(struct dccp_v4_pernet), }; static int __init dccp_v4_init(void) { int err = proto_register(&dccp_v4_prot, 1); if (err) goto out; inet_register_protosw(&dccp_v4_protosw); err = register_pernet_subsys(&dccp_v4_ops); if (err) goto out_destroy_ctl_sock; err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); if (err) goto out_proto_unregister; out: return err; out_proto_unregister: unregister_pernet_subsys(&dccp_v4_ops); out_destroy_ctl_sock: inet_unregister_protosw(&dccp_v4_protosw); proto_unregister(&dccp_v4_prot); goto out; } static void __exit dccp_v4_exit(void) { inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP); unregister_pernet_subsys(&dccp_v4_ops); inet_unregister_protosw(&dccp_v4_protosw); proto_unregister(&dccp_v4_prot); } module_init(dccp_v4_init); module_exit(dccp_v4_exit); /* * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) * values directly, Also cover the case where the protocol is not specified, * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 // SPDX-License-Identifier: GPL-2.0-only /* * OCB mode implementation * * Copyright: (c) 2014 Czech Technical University in Prague * (c) 2014 Volkswagen Group Research * Copyright (C) 2022-2023 Intel Corporation * Author: Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz> * Funded by: Volkswagen Group Research */ #include <linux/ieee80211.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "rdev-ops.h" int cfg80211_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB) return -EOPNOTSUPP; if (!rdev->ops->join_ocb) return -EOPNOTSUPP; if (WARN_ON(!setup->chandef.chan)) return -EINVAL; err = rdev_join_ocb(rdev, dev, setup); if (!err) wdev->u.ocb.chandef = setup->chandef; return err; } int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB) return -EOPNOTSUPP; if (!rdev->ops->leave_ocb) return -EOPNOTSUPP; if (!wdev->u.ocb.chandef.chan) return -ENOTCONN; err = rdev_leave_ocb(rdev, dev); if (!err) memset(&wdev->u.ocb.chandef, 0, sizeof(wdev->u.ocb.chandef)); return err; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Manage send buffer * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef SMC_TX_H #define SMC_TX_H #include <linux/socket.h> #include <linux/types.h> #include "smc.h" #include "smc_cdc.h" static inline int smc_tx_prepared_sends(struct smc_connection *conn) { union smc_host_cursor sent, prep; smc_curs_copy(&sent, &conn->tx_curs_sent, conn); smc_curs_copy(&prep, &conn->tx_curs_prep, conn); return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal); #endif /* SMC_TX_H */
2 2 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 // SPDX-License-Identifier: GPL-2.0 /* * Supplementary group IDs */ #include <linux/cred.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/sort.h> #include <linux/syscalls.h> #include <linux/user_namespace.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> struct group_info *groups_alloc(int gidsetsize) { struct group_info *gi; gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT); if (!gi) return NULL; refcount_set(&gi->usage, 1); gi->ngroups = gidsetsize; return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { kvfree(group_info); } EXPORT_SYMBOL(groups_free); /* export the group_info to a user-space array */ static int groups_to_user(gid_t __user *grouplist, const struct group_info *group_info) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } return 0; } /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; kgid_t kgid; if (get_user(gid, grouplist+i)) return -EFAULT; kgid = make_kgid(user_ns, gid); if (!gid_valid(kgid)) return -EINVAL; group_info->gid[i] = kgid; } return 0; } static int gid_cmp(const void *_a, const void *_b) { kgid_t a = *(kgid_t *)_a; kgid_t b = *(kgid_t *)_b; return gid_gt(a, b) - gid_lt(a, b); } void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) { unsigned int left, right; if (!group_info) return 0; left = 0; right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; } return 0; } /** * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install */ void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); get_group_info(group_info); new->group_info = group_info; } EXPORT_SYMBOL(set_groups); /** * set_current_groups - Change current's group subscription * @group_info: The group list to impose * * Validate a group subscription and, if valid, impose it upon current's task * security record. */ int set_current_groups(struct group_info *group_info) { struct cred *new; const struct cred *old; int retval; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); set_groups(new, group_info); retval = security_task_fix_setgroups(new, old); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } EXPORT_SYMBOL(set_current_groups); SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; if (gidsetsize < 0) return -EINVAL; /* no need to grab task_lock here; it cannot change */ i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: return i; } bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. */ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) { struct group_info *group_info; int retval; if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; group_info = groups_alloc(gidsetsize); if (!group_info) return -ENOMEM; retval = groups_from_user(group_info, grouplist); if (retval) { put_group_info(group_info); return retval; } groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); return retval; } /* * Check whether we're fsgid/egid or in the supplemental group.. */ int in_group_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->fsgid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_group_p); int in_egroup_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->egid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_egroup_p);
10 10 9 10 10 10 10 2 10 10 10 10 10 10 9 10 10 10 10 10 10 17 18 18 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2016 Qualcomm Atheros, Inc * * Based on net/sched/sch_fq_codel.c */ #ifndef __NET_SCHED_FQ_IMPL_H #define __NET_SCHED_FQ_IMPL_H #include <net/fq.h> /* functions that are embedded into includer */ static void __fq_adjust_removal(struct fq *fq, struct fq_flow *flow, unsigned int packets, unsigned int bytes, unsigned int truesize) { struct fq_tin *tin = flow->tin; int idx; tin->backlog_bytes -= bytes; tin->backlog_packets -= packets; flow->backlog -= bytes; fq->backlog -= packets; fq->memory_usage -= truesize; if (flow->backlog) return; if (flow == &tin->default_flow) { list_del_init(&tin->tin_list); return; } idx = flow - fq->flows; __clear_bit(idx, fq->flows_bitmap); } static void fq_adjust_removal(struct fq *fq, struct fq_flow *flow, struct sk_buff *skb) { __fq_adjust_removal(fq, flow, 1, skb->len, skb->truesize); } static struct sk_buff *fq_flow_dequeue(struct fq *fq, struct fq_flow *flow) { struct sk_buff *skb; lockdep_assert_held(&fq->lock); skb = __skb_dequeue(&flow->queue); if (!skb) return NULL; fq_adjust_removal(fq, flow, skb); return skb; } static int fq_flow_drop(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { unsigned int packets = 0, bytes = 0, truesize = 0; struct fq_tin *tin = flow->tin; struct sk_buff *skb; int pending; lockdep_assert_held(&fq->lock); pending = min_t(int, 32, skb_queue_len(&flow->queue) / 2); do { skb = __skb_dequeue(&flow->queue); if (!skb) break; packets++; bytes += skb->len; truesize += skb->truesize; free_func(fq, tin, flow, skb); } while (packets < pending); __fq_adjust_removal(fq, flow, packets, bytes, truesize); return packets; } static struct sk_buff *fq_tin_dequeue(struct fq *fq, struct fq_tin *tin, fq_tin_dequeue_t dequeue_func) { struct fq_flow *flow; struct list_head *head; struct sk_buff *skb; lockdep_assert_held(&fq->lock); begin: head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) return NULL; } flow = list_first_entry(head, struct fq_flow, flowchain); if (flow->deficit <= 0) { flow->deficit += fq->quantum; list_move_tail(&flow->flowchain, &tin->old_flows); goto begin; } skb = dequeue_func(fq, tin, flow); if (!skb) { /* force a pass through old_flows to prevent starvation */ if ((head == &tin->new_flows) && !list_empty(&tin->old_flows)) { list_move_tail(&flow->flowchain, &tin->old_flows); } else { list_del_init(&flow->flowchain); flow->tin = NULL; } goto begin; } flow->deficit -= skb->len; tin->tx_bytes += skb->len; tin->tx_packets++; return skb; } static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) { u32 hash = skb_get_hash(skb); return reciprocal_scale(hash, fq->flows_cnt); } static struct fq_flow *fq_flow_classify(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); flow = &fq->flows[idx]; if (flow->tin && flow->tin != tin) { flow = &tin->default_flow; tin->collisions++; fq->collisions++; } if (!flow->tin) tin->flows++; return flow; } static struct fq_flow *fq_find_fattest_flow(struct fq *fq) { struct fq_tin *tin; struct fq_flow *flow = NULL; u32 len = 0; int i; for_each_set_bit(i, fq->flows_bitmap, fq->flows_cnt) { struct fq_flow *cur = &fq->flows[i]; unsigned int cur_len; cur_len = cur->backlog; if (cur_len <= len) continue; flow = cur; len = cur_len; } list_for_each_entry(tin, &fq->tin_backlog, tin_list) { unsigned int cur_len = tin->default_flow.backlog; if (cur_len <= len) continue; flow = &tin->default_flow; len = cur_len; } return flow; } static void fq_tin_enqueue(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_skb_free_t free_func) { struct fq_flow *flow; struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); flow = fq_flow_classify(fq, tin, idx, skb); if (!flow->backlog) { if (flow != &tin->default_flow) __set_bit(idx, fq->flows_bitmap); else if (list_empty(&tin->tin_list)) list_add(&tin->tin_list, &fq->tin_backlog); } flow->tin = tin; skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); flow->backlog += skb->len; tin->backlog_bytes += skb->len; tin->backlog_packets++; fq->memory_usage += skb->truesize; fq->backlog++; __skb_queue_tail(&flow->queue, skb); } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; list_add_tail(&flow->flowchain, &tin->new_flows); } oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = fq_find_fattest_flow(fq); if (!flow) return; if (!fq_flow_drop(fq, flow, free_func)) return; flow->tin->overlimit++; fq->overlimit++; if (oom) { fq->overmemory++; oom = (fq->memory_usage > fq->memory_limit); } } } static void fq_flow_filter(struct fq *fq, struct fq_flow *flow, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb, *tmp; lockdep_assert_held(&fq->lock); skb_queue_walk_safe(&flow->queue, skb, tmp) { if (!filter_func(fq, tin, flow, skb, filter_data)) continue; __skb_unlink(skb, &flow->queue); fq_adjust_removal(fq, flow, skb); free_func(fq, tin, flow, skb); } } static void fq_tin_filter(struct fq *fq, struct fq_tin *tin, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); list_for_each_entry(flow, &tin->new_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); list_for_each_entry(flow, &tin->old_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); } static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb; while ((skb = fq_flow_dequeue(fq, flow))) free_func(fq, tin, flow, skb); if (!list_empty(&flow->flowchain)) { list_del_init(&flow->flowchain); if (list_empty(&tin->new_flows) && list_empty(&tin->old_flows)) list_del_init(&tin->tin_list); } flow->tin = NULL; WARN_ON_ONCE(flow->backlog); } static void fq_tin_reset(struct fq *fq, struct fq_tin *tin, fq_skb_free_t free_func) { struct list_head *head; struct fq_flow *flow; for (;;) { head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) break; } flow = list_first_entry(head, struct fq_flow, flowchain); fq_flow_reset(fq, flow, free_func); } WARN_ON_ONCE(!list_empty(&tin->tin_list)); WARN_ON_ONCE(tin->backlog_bytes); WARN_ON_ONCE(tin->backlog_packets); } static void fq_flow_init(struct fq_flow *flow) { INIT_LIST_HEAD(&flow->flowchain); __skb_queue_head_init(&flow->queue); } static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); INIT_LIST_HEAD(&tin->tin_list); fq_flow_init(&tin->default_flow); } static int fq_init(struct fq *fq, int flows_cnt) { int i; memset(fq, 0, sizeof(fq[0])); spin_lock_init(&fq->lock); INIT_LIST_HEAD(&fq->tin_backlog); fq->flows_cnt = max_t(u32, flows_cnt, 1); fq->quantum = 300; fq->limit = 8192; fq->memory_limit = 16 << 20; /* 16 MBytes */ fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); if (!fq->flows) return -ENOMEM; fq->flows_bitmap = bitmap_zalloc(fq->flows_cnt, GFP_KERNEL); if (!fq->flows_bitmap) { kvfree(fq->flows); fq->flows = NULL; return -ENOMEM; } for (i = 0; i < fq->flows_cnt; i++) fq_flow_init(&fq->flows[i]); return 0; } static void fq_reset(struct fq *fq, fq_skb_free_t free_func) { int i; for (i = 0; i < fq->flows_cnt; i++) fq_flow_reset(fq, &fq->flows[i], free_func); kvfree(fq->flows); fq->flows = NULL; bitmap_free(fq->flows_bitmap); fq->flows_bitmap = NULL; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H #define __LINUX_INSIDE_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations * * here's the role of the various spinlock/rwlock related include files: * * on SMP builds: * * asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the * initializers * * linux/spinlock_types_raw: * The raw types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * asm/spinlock.h: contains the arch_spin_*()/etc. lowlevel * implementations, mostly inline assembly code * * (also included on UP-debug builds:) * * linux/spinlock_api_smp.h: * contains the prototypes for the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. * * on UP builds: * * linux/spinlock_type_up.h: * contains the generic, simplified UP spinlock type. * (which is an empty structure on non-debug builds) * * linux/spinlock_types_raw: * The raw RT types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * linux/spinlock_up.h: * contains the arch_spin_*()/etc. version of UP * builds. (which are NOPs on non-debug, non-preempt * builds) * * (included on UP-non-debug builds:) * * linux/spinlock_api_up.h: * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. */ #include <linux/typecheck.h> #include <linux/preempt.h> #include <linux/linkage.h> #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/thread_info.h> #include <linux/stringify.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/barrier.h> #include <asm/mmiowb.h> /* * Must define these before including other files, inline functions need them */ #define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME #define LOCK_SECTION_START(extra) \ ".subsection 1\n\t" \ extra \ ".ifndef " LOCK_SECTION_NAME "\n\t" \ LOCK_SECTION_NAME ":\n\t" \ ".endif\n" #define LOCK_SECTION_END \ ".previous\n\t" #define __lockfunc __section(".spinlock.text") /* * Pull the arch_spinlock_t and arch_rwlock_t definitions: */ #include <linux/spinlock_types.h> /* * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them): */ #ifdef CONFIG_SMP # include <asm/spinlock.h> #else # include <linux/spinlock_up.h> #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner); # define raw_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN); \ } while (0) #else # define raw_spin_lock_init(lock) \ do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) #endif #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ /* * smp_mb__after_spinlock() provides the equivalent of a full memory barrier * between program-order earlier lock acquisitions and program-order later * memory accesses. * * This guarantees that the following two properties hold: * * 1) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 * * WRITE_ONCE(X, 1); WRITE_ONCE(Y, 1); * spin_lock(S); smp_mb(); * smp_mb__after_spinlock(); r1 = READ_ONCE(X); * r0 = READ_ONCE(Y); * spin_unlock(S); * * it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0) * and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments * preceding the call to smp_mb__after_spinlock() in __schedule() and in * try_to_wake_up(). * * 2) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 CPU2 * * spin_lock(S); spin_lock(S); r1 = READ_ONCE(Y); * WRITE_ONCE(X, 1); smp_mb__after_spinlock(); smp_rmb(); * spin_unlock(S); r0 = READ_ONCE(X); r2 = READ_ONCE(X); * WRITE_ONCE(Y, 1); * spin_unlock(S); * * it is forbidden that CPU0's critical section executes before CPU1's * critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1) * and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments * preceding the calls to smp_rmb() in try_to_wake_up() for similar * snippets but "projected" onto two CPUs. * * Property (2) upgrades the lock to an RCsc lock. * * Since most load-store architectures implement ACQUIRE with an smp_mb() after * the LL/SC loop, they need no further barriers. Similarly all our TSO * architectures imply an smp_mb() for each atomic instruction and equally don't * need more. * * Architectures that can implement ACQUIRE better need to take care. */ #ifndef smp_mb__after_spinlock #define smp_mb__after_spinlock() kcsan_mb() #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); extern int do_raw_spin_trylock(raw_spinlock_t *lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) { __acquire(lock); arch_spin_lock(&lock->raw_lock); mmiowb_spin_lock(); } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); if (ret) mmiowb_spin_lock(); return ret; } static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) { mmiowb_spin_unlock(); arch_spin_unlock(&lock->raw_lock); __release(lock); } #endif /* * Define the various spin_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The * various methods are defined as nops in the case they are not * required. */ #define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) #define raw_spin_lock(lock) _raw_spin_lock(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else /* * Always evaluate the 'subclass' argument to avoid that the compiler * warns about set-but-not-used variables when building with * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1. */ # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave_nested(lock, subclass); \ } while (0) #else #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #endif #else #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_lock_irqsave(lock, flags); \ } while (0) #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ raw_spin_lock_irqsave(lock, flags) #endif #define raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) #define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock) #define raw_spin_unlock(lock) _raw_spin_unlock(lock) #define raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) #define raw_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_unlock_irqrestore(lock, flags); \ } while (0) #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) #define raw_spin_trylock_bh(lock) \ __cond_lock(lock, _raw_spin_trylock_bh(lock)) #define raw_spin_trylock_irq(lock) \ ({ \ local_irq_disable(); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_enable(); 0; }); \ }) #define raw_spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) #ifndef CONFIG_PREEMPT_RT /* Include rwlock functions for !RT */ #include <linux/rwlock.h> #endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) # include <linux/spinlock_api_smp.h> #else # include <linux/spinlock_api_up.h> #endif /* Non PREEMPT_RT kernel, map to raw spinlocks: */ #ifndef CONFIG_PREEMPT_RT /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) { return &lock->rlock; } #ifdef CONFIG_DEBUG_SPINLOCK # define spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init(spinlock_check(lock), \ #lock, &__key, LD_WAIT_CONFIG); \ } while (0) #else # define spin_lock_init(_lock) \ do { \ spinlock_check(_lock); \ *(_lock) = __SPIN_LOCK_UNLOCKED(_lock); \ } while (0) #endif static __always_inline void spin_lock(spinlock_t *lock) { raw_spin_lock(&lock->rlock); } static __always_inline void spin_lock_bh(spinlock_t *lock) { raw_spin_lock_bh(&lock->rlock); } static __always_inline int spin_trylock(spinlock_t *lock) { return raw_spin_trylock(&lock->rlock); } #define spin_lock_nested(lock, subclass) \ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ } while (0) static __always_inline void spin_lock_irq(spinlock_t *lock) { raw_spin_lock_irq(&lock->rlock); } #define spin_lock_irqsave(lock, flags) \ do { \ raw_spin_lock_irqsave(spinlock_check(lock), flags); \ } while (0) #define spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ } while (0) static __always_inline void spin_unlock(spinlock_t *lock) { raw_spin_unlock(&lock->rlock); } static __always_inline void spin_unlock_bh(spinlock_t *lock) { raw_spin_unlock_bh(&lock->rlock); } static __always_inline void spin_unlock_irq(spinlock_t *lock) { raw_spin_unlock_irq(&lock->rlock); } static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { raw_spin_unlock_irqrestore(&lock->rlock, flags); } static __always_inline int spin_trylock_bh(spinlock_t *lock) { return raw_spin_trylock_bh(&lock->rlock); } static __always_inline int spin_trylock_irq(spinlock_t *lock) { return raw_spin_trylock_irq(&lock->rlock); } #define spin_trylock_irqsave(lock, flags) \ ({ \ raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) /** * spin_is_locked() - Check whether a spinlock is locked. * @lock: Pointer to the spinlock. * * This function is NOT required to provide any memory ordering * guarantees; it could be used for debugging purposes or, when * additional synchronization is needed, accompanied with other * constructs (memory barriers) enforcing the synchronization. * * Returns: 1 if @lock is locked, 0 otherwise. * * Note that the function only tells you that the spinlock is * seen to be locked, not that it is locked on your CPU. * * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n, * the return value is always 0 (see include/linux/spinlock_up.h). * Therefore you should not rely heavily on the return value. */ static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); } static __always_inline int spin_is_contended(spinlock_t *lock) { return raw_spin_is_contended(&lock->rlock); } #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) #else /* !CONFIG_PREEMPT_RT */ # include <linux/spinlock_rt.h> #endif /* CONFIG_PREEMPT_RT */ /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { #ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; #endif } /* * Check if a rwlock is contended. * Returns non-zero if there is another task waiting on the rwlock. * Returns zero if the lock is not contended or the system / underlying * rwlock implementation does not support contention detection. * Technically does not depend on CONFIG_PREEMPTION, but a general need * for low latency. */ static inline int rwlock_needbreak(rwlock_t *lock) { #ifdef CONFIG_PREEMPTION return rwlock_is_contended(lock); #else return 0; #endif } /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) */ #include <linux/atomic.h> /** * atomic_dec_and_lock - lock on reaching reference count zero * @atomic: the atomic counter * @lock: the spinlock in question * * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases. */ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags); #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock); #define atomic_dec_and_raw_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock)) extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags); #define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags))) int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, struct lock_class_key *key); #define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp) \ ({ \ static struct lock_class_key key; \ int ret; \ \ ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size, \ cpu_mult, gfp, #locks, &key); \ ret; \ }) void free_bucket_spinlocks(spinlock_t *locks); DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t, raw_spin_lock(_T->lock), raw_spin_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock)) DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t, raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING), raw_spin_unlock(_T->lock)) DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t, raw_spin_lock_irq(_T->lock), raw_spin_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t, raw_spin_lock_irqsave(_T->lock, _T->flags), raw_spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try, raw_spin_trylock_irqsave(_T->lock, _T->flags)) DEFINE_LOCK_GUARD_1(spinlock, spinlock_t, spin_lock(_T->lock), spin_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock)) DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t, spin_lock_irq(_T->lock), spin_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try, spin_trylock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, spin_lock_irqsave(_T->lock, _T->flags), spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, spin_trylock_irqsave(_T->lock, _T->flags)) DEFINE_LOCK_GUARD_1(read_lock, rwlock_t, read_lock(_T->lock), read_unlock(_T->lock)) DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t, read_lock_irq(_T->lock), read_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t, read_lock_irqsave(_T->lock, _T->flags), read_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DEFINE_LOCK_GUARD_1(write_lock, rwlock_t, write_lock(_T->lock), write_unlock(_T->lock)) DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t, write_lock_irq(_T->lock), write_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t, write_lock_irqsave(_T->lock, _T->flags), write_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */
83 1 77 3 80 82 82 82 23 12 26 6 20 10 4 18 18 22 26 6 20 26 136 18 76 15 68 26 25 26 26 4 22 5 21 24 2 5 5 2 21 5 15 18 2 2 2 7 14 21 9 9 19 2 13 13 1 1 14 13 1 7 9 8 5 3 1 8 5 3 7 7 31 32 30 1 15 1 13 2 4 1 1 2 1 1 1 9 13 29 16 13 13 13 10 13 13 13 19 10 8 2 6 1 17 46 11 7 20 5 7 8 15 26 2 30 2 5 25 25 24 24 23 1 4 18 7 23 23 22 23 23 22 1 1 1 23 1 1 1 23 1 29 29 28 27 23 6 2 4 14 14 28 14 14 28 53 3 50 31 16 16 107 106 42 63 48 3 4 15 91 24 37 37 102 1 92 9 37 1 22 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 // SPDX-License-Identifier: GPL-2.0-or-later /* * inet_diag.c Module for monitoring INET transport protocols sockets. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/time.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/inet6_hashtables.h> #include <net/bpf_sk_storage.h> #include <net/netlink.h> #include <linux/inet.h> #include <linux/stddef.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> static const struct inet_diag_handler __rcu **inet_diag_table; struct inet_diag_entry { const __be32 *saddr; const __be32 *daddr; u16 sport; u16 dport; u16 family; u16 userlocks; u32 ifindex; u32 mark; #ifdef CONFIG_SOCK_CGROUP_DATA u64 cgroup_id; #endif }; static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { const struct inet_diag_handler *handler; if (proto < 0 || proto >= IPPROTO_MAX) return NULL; if (!READ_ONCE(inet_diag_table[proto])) sock_load_diag_module(AF_INET, proto); rcu_read_lock(); handler = rcu_dereference(inet_diag_table[proto]); if (handler && !try_module_get(handler->owner)) handler = NULL; rcu_read_unlock(); return handler; } static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) { module_put(handler->owner); } void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) { r->idiag_family = sk->sk_family; r->id.idiag_sport = htons(sk->sk_num); r->id.idiag_dport = sk->sk_dport; r->id.idiag_if = sk->sk_bound_dev_if; sock_diag_save_cookie(sk, r->id.idiag_cookie); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; } else #endif { memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = sk->sk_rcv_saddr; r->id.idiag_dst[0] = sk->sk_daddr; } } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); static size_t inet_sk_attr_size(struct sock *sk, const struct inet_diag_req_v2 *req, bool net_admin) { const struct inet_diag_handler *handler; size_t aux = 0; rcu_read_lock(); handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); DEBUG_NET_WARN_ON_ONCE(!handler); if (handler && handler->idiag_get_aux_size) aux = handler->idiag_get_aux_size(sk, net_admin); rcu_read_unlock(); return nla_total_size(sizeof(struct tcp_info)) + nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + nla_total_size(TCP_CA_NAME_MAX) + nla_total_size(sizeof(struct tcpvegas_info)) + aux + 64; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); struct inet_diag_sockopt inet_sockopt; if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, * hence this needs to be included regardless of socket family. */ if (ext & (1 << (INET_DIAG_TOS - 1))) if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0) goto errout; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { if (ext & (1 << (INET_DIAG_TCLASS - 1))) if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk))) goto errout; } #endif if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark))) goto errout; if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; #ifdef CONFIG_SOCK_CGROUP_DATA classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif /* Fallback to socket priority if class id isn't set. * Classful qdiscs use it as direct reference to class. * For cgroup2 classid is always zero. */ if (!classid) classid = READ_ONCE(sk->sk_priority); if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) goto errout; } #ifdef CONFIG_SOCK_CGROUP_DATA if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), INET_DIAG_PAD)) goto errout; #endif r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); memset(&inet_sockopt, 0, sizeof(inet_sockopt)); inet_sockopt.recverr = inet_test_bit(RECVERR, sk); inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk); inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk); inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk); inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk); if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), &inet_sockopt)) goto errout; return 0; errout: return 1; } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); static int inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, struct nlattr **req_nlas) { struct nlattr *nla; int remaining; nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { int type = nla_type(nla); if (type == INET_DIAG_REQ_PROTOCOL && nla_len(nla) != sizeof(u32)) return -EINVAL; if (type < __INET_DIAG_REQ_MAX) req_nlas[type] = nla; } return 0; } static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, const struct inet_diag_dump_data *data) { if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); return req->sdiag_protocol; } #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; struct inet_diag_dump_data *cb_data; int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; int protocol; cb_data = cb->data; protocol = inet_diag_get_protocol(req, cb_data); /* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */ handler = rcu_dereference_protected(inet_diag_table[protocol], 1); DEBUG_NET_WARN_ON_ONCE(!handler); if (!handler) return -ENXIO; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; r->idiag_timer = 0; r->idiag_retrans = 0; r->idiag_expires = 0; if (inet_diag_msg_attrs_fill(sk, skb, r, ext, sk_user_ns(NETLINK_CB(cb->skb).sk), net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = sk_forward_alloc_get(sk), .idiag_tmem = sk_wmem_alloc_get(sk), }; if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) goto errout; } if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) goto errout; /* * RAW sockets might have user-defined protocols assigned, * so report the one supplied on socket creation. */ if (sk->sk_type == SOCK_RAW) { if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol)) goto errout; } if (!icsk) { handler->idiag_get_info(sk, r, NULL); goto out; } if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve_64bit(skb, INET_DIAG_INFO, handler->idiag_info_size, INET_DIAG_PAD); if (!attr) goto errout; info = nla_data(attr); } if (ext & (1 << (INET_DIAG_CONG - 1))) { int err = 0; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops) err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name); rcu_read_unlock(); if (err < 0) goto errout; } handler->idiag_get_info(sk, r, info); if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux) if (handler->idiag_get_aux(sk, net_admin, skb) < 0) goto errout; if (sk->sk_state < TCP_TIME_WAIT) { union tcp_cc_info info; size_t sz = 0; int attr; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ext, &attr, &info); rcu_read_unlock(); if (sz && nla_put(skb, attr, sz, &info) < 0) goto errout; } /* Keep it at the end for potential retry with a larger skb, * or else do best-effort fitting, which is only done for the * first_nlmsg. */ if (cb_data->bpf_stg_diag) { bool first_nlmsg = ((unsigned char *)nlh == skb->data); unsigned int prev_min_dump_alloc; unsigned int total_nla_size = 0; unsigned int msg_len; int err; msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb, INET_DIAG_SK_BPF_STORAGES, &total_nla_size); if (!err) goto out; total_nla_size += msg_len; prev_min_dump_alloc = cb->min_dump_alloc; if (total_nla_size > prev_min_dump_alloc) cb->min_dump_alloc = min_t(u32, total_nla_size, MAX_DUMP_ALLOC_SIZE); if (!first_nlmsg) goto errout; if (cb->min_dump_alloc > prev_min_dump_alloc) /* Retry with pskb_expand_head() with * __GFP_DIRECT_RECLAIM */ goto errout; WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc); /* Send what we have for this sk * and move on to the next sk in the following * dump() */ } out: nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, u16 nlmsg_flags, bool net_admin) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; tmo = tw->tw_timer.expires - jiffies; r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, tw->tw_mark)) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, u16 nlmsg_flags, bool net_admin) { struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, inet_rsk(reqsk)->ir_mark)) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req) { struct sock *sk; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else sk = inet6_lookup(net, hashinfo, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if); } #endif else { rcu_read_unlock(); return ERR_PTR(-EINVAL); } rcu_read_unlock(); if (!sk) return ERR_PTR(-ENOENT); if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_gen_put(sk); return ERR_PTR(-ENOENT); } return sk; } EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); struct net *net = sock_net(in_skb->sk); struct sk_buff *rep; struct sock *sk; int err; sk = inet_diag_find_one_icsk(net, hashinfo, req); if (IS_ERR(sk)) return PTR_ERR(sk); rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; } err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_gen_put(sk); return err; } EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; struct inet_diag_dump_data dump_data; int err, protocol; memset(&dump_data, 0, sizeof(dump_data)); err = inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); if (err) return err; protocol = inet_diag_get_protocol(req, &dump_data); handler = inet_diag_lock_handler(protocol); if (!handler) return -ENOENT; if (cmd == SOCK_DIAG_BY_FAMILY) { struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { err = handler->destroy(in_skb, req); } else { err = -EOPNOTSUPP; } inet_diag_unlock_handler(handler); return err; } static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits) { int words = bits >> 5; bits &= 0x1f; if (words) { if (memcmp(a1, a2, words << 2)) return 0; } if (bits) { __be32 w1, w2; __be32 mask; w1 = a1[words]; w2 = a2[words]; mask = htonl((0xffffffff) << (32 - bits)); if ((w1 ^ w2) & mask) return 0; } return 1; } static int inet_diag_bc_run(const struct nlattr *_bc, const struct inet_diag_entry *entry) { const void *bc = nla_data(_bc); int len = nla_len(_bc); while (len > 0) { int yes = 1; const struct inet_diag_bc_op *op = bc; switch (op->code) { case INET_DIAG_BC_NOP: break; case INET_DIAG_BC_JMP: yes = 0; break; case INET_DIAG_BC_S_EQ: yes = entry->sport == op[1].no; break; case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: yes = entry->sport <= op[1].no; break; case INET_DIAG_BC_D_EQ: yes = entry->dport == op[1].no; break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; case INET_DIAG_BC_D_LE: yes = entry->dport <= op[1].no; break; case INET_DIAG_BC_AUTO: yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); break; case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: { const struct inet_diag_hostcond *cond; const __be32 *addr; cond = (const struct inet_diag_hostcond *)(op + 1); if (cond->port != -1 && cond->port != (op->code == INET_DIAG_BC_S_COND ? entry->sport : entry->dport)) { yes = 0; break; } if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; if (cond->family != AF_UNSPEC && cond->family != entry->family) { if (entry->family == AF_INET6 && cond->family == AF_INET) { if (addr[0] == 0 && addr[1] == 0 && addr[2] == htonl(0xffff) && bitstring_match(addr + 3, cond->addr, cond->prefix_len)) break; } yes = 0; break; } if (cond->prefix_len == 0) break; if (bitstring_match(addr, cond->addr, cond->prefix_len)) break; yes = 0; break; } case INET_DIAG_BC_DEV_COND: { u32 ifindex; ifindex = *((const u32 *)(op + 1)); if (ifindex != entry->ifindex) yes = 0; break; } case INET_DIAG_BC_MARK_COND: { struct inet_diag_markcond *cond; cond = (struct inet_diag_markcond *)(op + 1); if ((entry->mark & cond->mask) != cond->mark) yes = 0; break; } #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: { u64 cgroup_id; cgroup_id = get_unaligned((const u64 *)(op + 1)); if (cgroup_id != entry->cgroup_id) yes = 0; break; } #endif } if (yes) { len -= op->yes; bc += op->yes; } else { len -= op->no; bc += op->no; } } return len == 0; } /* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV) */ static void entry_fill_addrs(struct inet_diag_entry *entry, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32; entry->daddr = sk->sk_v6_daddr.s6_addr32; } else #endif { entry->saddr = &sk->sk_rcv_saddr; entry->daddr = &sk->sk_daddr; } } int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct inet_diag_entry entry; if (!bc) return 1; entry.family = sk->sk_family; entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; if (sk_fullsock(sk)) entry.mark = READ_ONCE(sk->sk_mark); else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else if (sk->sk_state == TCP_TIME_WAIT) entry.mark = inet_twsk(sk)->tw_mark; else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA entry.cgroup_id = sk_fullsock(sk) ? cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); } EXPORT_SYMBOL_GPL(inet_diag_bc_sk); static int valid_cc(const void *bc, int len, int cc) { while (len >= 0) { const struct inet_diag_bc_op *op = bc; if (cc > len) return 0; if (cc == len) return 1; if (op->yes < 4 || op->yes & 3) return 0; len -= op->yes; bc += op->yes; } return 0; } /* data is u32 ifindex */ static bool valid_devcond(const struct inet_diag_bc_op *op, int len, int *min_len) { /* Check ifindex space. */ *min_len += sizeof(u32); if (len < *min_len) return false; return true; } /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) { struct inet_diag_hostcond *cond; int addr_len; /* Check hostcond space. */ *min_len += sizeof(struct inet_diag_hostcond); if (len < *min_len) return false; cond = (struct inet_diag_hostcond *)(op + 1); /* Check address family and address length. */ switch (cond->family) { case AF_UNSPEC: addr_len = 0; break; case AF_INET: addr_len = sizeof(struct in_addr); break; case AF_INET6: addr_len = sizeof(struct in6_addr); break; default: return false; } *min_len += addr_len; if (len < *min_len) return false; /* Check prefix length (in bits) vs address length (in bytes). */ if (cond->prefix_len > 8 * addr_len) return false; return true; } /* Validate a port comparison operator. */ static bool valid_port_comparison(const struct inet_diag_bc_op *op, int len, int *min_len) { /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ *min_len += sizeof(struct inet_diag_bc_op); if (len < *min_len) return false; return true; } static bool valid_markcond(const struct inet_diag_bc_op *op, int len, int *min_len) { *min_len += sizeof(struct inet_diag_markcond); return len >= *min_len; } #ifdef CONFIG_SOCK_CGROUP_DATA static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, int *min_len) { *min_len += sizeof(u64); return len >= *min_len; } #endif static int inet_diag_bc_audit(const struct nlattr *attr, const struct sk_buff *skb) { bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); const void *bytecode, *bc; int bytecode_len, len; if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) return -EINVAL; bytecode = bc = nla_data(attr); len = bytecode_len = nla_len(attr); while (len > 0) { int min_len = sizeof(struct inet_diag_bc_op); const struct inet_diag_bc_op *op = bc; switch (op->code) { case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_DEV_COND: if (!valid_devcond(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_S_EQ: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_EQ: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_MARK_COND: if (!net_admin) return -EPERM; if (!valid_markcond(bc, len, &min_len)) return -EINVAL; break; #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: if (!valid_cgroupcond(bc, len, &min_len)) return -EINVAL; break; #endif case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; default: return -EINVAL; } if (op->code != INET_DIAG_BC_NOP) { if (op->no < min_len || op->no > len + 4 || op->no & 3) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len - op->no)) return -EINVAL; } if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) return -EINVAL; bc += op->yes; len -= op->yes; } return len == 0 ? 0 : -EINVAL; } static void twsk_build_assert(void) { BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != offsetof(struct sock, sk_family)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != offsetof(struct inet_sock, inet_num)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != offsetof(struct inet_sock, inet_dport)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != offsetof(struct inet_sock, inet_rcv_saddr)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != offsetof(struct inet_sock, inet_daddr)); #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != offsetof(struct sock, sk_v6_rcv_saddr)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != offsetof(struct sock, sk_v6_daddr)); #endif } void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; int i, num, s_i, s_num; struct nlattr *bc; struct sock *sk; bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) goto skip_listen_ht; for (i = s_i; i <= hashinfo->lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; num = 0; ilb = &hashinfo->lhash2[i]; if (hlist_nulls_empty(&ilb->nulls_head)) { s_num = 0; continue; } spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { num++; continue; } if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_listen; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_listen; if (!inet_diag_bc_sk(bc, sk)) goto next_listen; if (inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, NLM_F_MULTI, net_admin) < 0) { spin_unlock(&ilb->lock); goto done; } next_listen: ++num; } spin_unlock(&ilb->lock); s_num = 0; } skip_listen_ht: cb->args[0] = 1; s_i = num = s_num = 0; } /* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets * with bh disabled. */ #define SKARR_SZ 16 /* Dump bound but inactive (not listening, connecting, etc.) sockets */ if (cb->args[0] == 1) { if (!(idiag_states & TCPF_BOUND_INACTIVE)) goto skip_bind_ht; for (i = s_i; i < hashinfo->bhash_size; i++) { struct inet_bind_hashbucket *ibb; struct inet_bind2_bucket *tb2; struct sock *sk_arr[SKARR_SZ]; int num_arr[SKARR_SZ]; int idx, accum, res; resume_bind_walk: num = 0; accum = 0; ibb = &hashinfo->bhash2[i]; if (hlist_empty(&ibb->chain)) { s_num = 0; continue; } spin_lock_bh(&ibb->lock); inet_bind_bucket_for_each(tb2, &ibb->chain) { if (!net_eq(ib2_net(tb2), net)) continue; sk_for_each_bound(sk, &tb2->owners) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) goto next_bind; if (sk->sk_state != TCP_CLOSE || !inet->inet_num) goto next_bind; if (r->sdiag_family != AF_UNSPEC && r->sdiag_family != sk->sk_family) goto next_bind; if (!inet_diag_bc_sk(bc, sk)) goto next_bind; sock_hold(sk); num_arr[accum] = num; sk_arr[accum] = sk; if (++accum == SKARR_SZ) goto pause_bind_walk; next_bind: num++; } } pause_bind_walk: spin_unlock_bh(&ibb->lock); res = 0; for (idx = 0; idx < accum; idx++) { if (res >= 0) { res = inet_sk_diag_fill(sk_arr[idx], NULL, skb, cb, r, NLM_F_MULTI, net_admin); if (res < 0) num = num_arr[idx]; } sock_put(sk_arr[idx]); } if (res < 0) goto done; cond_resched(); if (accum == SKARR_SZ) { s_num = num + 1; goto resume_bind_walk; } s_num = 0; } skip_bind_ht: cb->args[0] = 2; s_i = num = s_num = 0; } if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); struct hlist_nulls_node *node; struct sock *sk_arr[SKARR_SZ]; int num_arr[SKARR_SZ]; int idx, accum, res; if (hlist_nulls_empty(&head->chain)) continue; if (i > s_i) s_num = 0; next_chunk: num = 0; accum = 0; spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { int state; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_normal; if (r->id.idiag_sport != htons(sk->sk_num) && r->id.idiag_sport) goto next_normal; if (r->id.idiag_dport != sk->sk_dport && r->id.idiag_dport) goto next_normal; twsk_build_assert(); if (!inet_diag_bc_sk(bc, sk)) goto next_normal; if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto next_normal; num_arr[accum] = num; sk_arr[accum] = sk; if (++accum == SKARR_SZ) break; next_normal: ++num; } spin_unlock_bh(lock); res = 0; for (idx = 0; idx < accum; idx++) { if (res >= 0) { res = sk_diag_fill(sk_arr[idx], skb, cb, r, NLM_F_MULTI, net_admin); if (res < 0) num = num_arr[idx]; } sock_gen_put(sk_arr[idx]); } if (res < 0) break; cond_resched(); if (accum == SKARR_SZ) { s_num = num + 1; goto next_chunk; } } done: cb->args[1] = i; cb->args[2] = num; out: ; } EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; int protocol, err = 0; protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(protocol); if (handler) { handler->dump(skb, cb, r); inet_diag_unlock_handler(handler); } else { err = -ENOENT; } /* The skb is not large enough to fit one sk info and * inet_sk_diag_fill() has requested for a larger skb. */ if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) { err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL); if (!err) goto again; } return err ? : skb->len; } static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh)); } static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) { const struct nlmsghdr *nlh = cb->nlh; struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; err = inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); if (err) { kfree(cb_data); return err; } nla = cb_data->inet_diag_nla_bc; if (nla) { err = inet_diag_bc_audit(nla, skb); if (err) { kfree(cb_data); return err; } } nla = cb_data->inet_diag_nla_bpf_stgs; if (nla) { struct bpf_sk_storage_diag *bpf_stg_diag; bpf_stg_diag = bpf_sk_storage_diag_alloc(nla); if (IS_ERR(bpf_stg_diag)) { kfree(cb_data); return PTR_ERR(bpf_stg_diag); } cb_data->bpf_stg_diag = bpf_stg_diag; } cb->data = cb_data; return 0; } static int inet_diag_dump_start(struct netlink_callback *cb) { return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2)); } static int inet_diag_dump_start_compat(struct netlink_callback *cb) { return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req)); } static int inet_diag_dump_done(struct netlink_callback *cb) { struct inet_diag_dump_data *cb_data = cb->data; bpf_sk_storage_diag_free(cb_data->bpf_stg_diag); kfree(cb->data); return 0; } static int inet_diag_type2proto(int type) { switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; case DCCPDIAG_GETSOCK: return IPPROTO_DCCP; default: return 0; } } static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); struct inet_diag_req_v2 req; req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; req.idiag_states = rc->idiag_states; req.id = rc->id; return __inet_diag_dump(skb, cb, &req); } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { struct inet_diag_req *rc = nlmsg_data(nlh); struct inet_diag_req_v2 req; req.sdiag_family = rc->idiag_family; req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; req.idiag_states = rc->idiag_states; req.id = rc->id; return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) { int hdrlen = sizeof(struct inet_diag_req); struct net *net = sock_net(skb->sk); if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || nlmsg_len(nlh) < hdrlen) return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = inet_diag_dump_start_compat, .done = inet_diag_dump_done, .dump = inet_diag_dump_compat, }; return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } return inet_diag_get_exact_compat(skb, nlh); } static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct inet_diag_req_v2); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = inet_diag_dump_start, .done = inet_diag_dump_done, .dump = inet_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, nlmsg_data(h)); } static int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) { const struct inet_diag_handler *handler; struct nlmsghdr *nlh; struct nlattr *attr; struct inet_diag_msg *r; void *info = NULL; int err = 0; nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); if (!nlh) return -ENOMEM; r = nlmsg_data(nlh); memset(r, 0, sizeof(*r)); inet_diag_msg_common_fill(r, sk); if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM) r->id.idiag_sport = inet_sk(sk)->inet_sport; r->idiag_state = sk->sk_state; if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { nlmsg_cancel(skb, nlh); return err; } handler = inet_diag_lock_handler(sk->sk_protocol); if (!handler) { nlmsg_cancel(skb, nlh); return -ENOENT; } attr = handler->idiag_info_size ? nla_reserve_64bit(skb, INET_DIAG_INFO, handler->idiag_info_size, INET_DIAG_PAD) : NULL; if (attr) info = nla_data(attr); handler->idiag_get_info(sk, r, info); inet_diag_unlock_handler(handler); nlmsg_end(skb, nlh); return 0; } static const struct sock_diag_handler inet_diag_handler = { .owner = THIS_MODULE, .family = AF_INET, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, .destroy = inet_diag_handler_cmd, }; static const struct sock_diag_handler inet6_diag_handler = { .owner = THIS_MODULE, .family = AF_INET6, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, .destroy = inet_diag_handler_cmd, }; int inet_diag_register(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; if (type >= IPPROTO_MAX) return -EINVAL; return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type], NULL, h) ? 0 : -EEXIST; } EXPORT_SYMBOL_GPL(inet_diag_register); void inet_diag_unregister(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; if (type >= IPPROTO_MAX) return; xchg((const struct inet_diag_handler **)&inet_diag_table[type], NULL); } EXPORT_SYMBOL_GPL(inet_diag_unregister); static const struct sock_diag_inet_compat inet_diag_compat = { .owner = THIS_MODULE, .fn = inet_diag_rcv_msg_compat, }; static int __init inet_diag_init(void) { const int inet_diag_table_size = (IPPROTO_MAX * sizeof(struct inet_diag_handler *)); int err = -ENOMEM; inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL); if (!inet_diag_table) goto out; err = sock_diag_register(&inet_diag_handler); if (err) goto out_free_nl; err = sock_diag_register(&inet6_diag_handler); if (err) goto out_free_inet; sock_diag_register_inet_compat(&inet_diag_compat); out: return err; out_free_inet: sock_diag_unregister(&inet_diag_handler); out_free_nl: kfree(inet_diag_table); goto out; } static void __exit inet_diag_exit(void) { sock_diag_unregister(&inet6_diag_handler); sock_diag_unregister(&inet_diag_handler); sock_diag_unregister_inet_compat(&inet_diag_compat); kfree(inet_diag_table); } module_init(inet_diag_init); module_exit(inet_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
93 93 2 2 91 91 7798 7814 7787 7786 501 502 501 176 57 276 57 57 30 30 247 247 176 177 176 527 16 511 511 511 2650 2648 442 443 1284 1289 10 1274 499 500 16 16 1310 1313 1313 1002 1001 1001 362 365 365 282 136 152 9 280 280 281 4465 4456 4466 4454 4461 3965 599 1300 1292 1301 1300 1304 2281 2277 2401 594 595 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor LSM hooks. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/lsm_hooks.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/ptrace.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/zstd.h> #include <net/sock.h> #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/file.h" #include "include/ipc.h" #include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" #include "include/mount.h" #include "include/secid.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; union aa_buffer { struct list_head list; DECLARE_FLEX_ARRAY(char, buffer); }; struct aa_local_cache { unsigned int hold; unsigned int count; struct list_head head; }; #define RESERVE_COUNT 2 static int reserve_count = RESERVE_COUNT; static int buffer_count; static LIST_HEAD(aa_global_buffers); static DEFINE_SPINLOCK(aa_buffers_lock); static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); /* * LSM hook functions */ /* * put the associated labels */ static void apparmor_cred_free(struct cred *cred) { aa_put_label(cred_label(cred)); set_cred_label(cred, NULL); } /* * allocate the apparmor part of blank credentials */ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) { set_cred_label(cred, NULL); return 0; } /* * prepare new cred label for modification by prepare_cred block */ static int apparmor_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { set_cred_label(new, aa_get_newest_label(cred_label(old))); return 0; } /* * transfer the apparmor data to a blank set of creds */ static void apparmor_cred_transfer(struct cred *new, const struct cred *old) { set_cred_label(new, aa_get_newest_label(cred_label(old))); } static void apparmor_task_free(struct task_struct *task) { aa_free_task_ctx(task_ctx(task)); } static int apparmor_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct aa_task_ctx *new = task_ctx(task); aa_dup_task_ctx(new, task_ctx(current)); return 0; } static int apparmor_ptrace_access_check(struct task_struct *child, unsigned int mode) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ tracer = __begin_current_label_crit_section(); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); __end_current_label_crit_section(tracer); put_cred(cred); return error; } static int apparmor_ptrace_traceme(struct task_struct *parent) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; tracee = __begin_current_label_crit_section(); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); __end_current_label_crit_section(tracee); return error; } /* Derived from security/commoncap.c:cap_capget */ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct aa_label *label; const struct cred *cred; rcu_read_lock(); cred = __task_cred(target); label = aa_get_newest_cred_label(cred); /* * cap_capget is stacked ahead of this and will * initialize effective and permitted. */ if (!unconfined(label)) { struct aa_profile *profile; struct label_it i; label_for_each_confined(i, label, profile) { struct aa_ruleset *rules; if (COMPLAIN_MODE(profile)) continue; rules = list_first_entry(&profile->rules, typeof(*rules), list); *effective = cap_intersect(*effective, rules->caps.allow); *permitted = cap_intersect(*permitted, rules->caps.allow); } } rcu_read_unlock(); aa_put_label(label); return 0; } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) error = aa_capable(cred, label, cap, opts); aa_put_label(label); return error; } /** * common_perm - basic common permission check wrapper fn for paths * @op: operation being checked * @path: path to check permission of (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm(const char *op, const struct path *path, u32 mask, struct path_cond *cond) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); __end_current_label_crit_section(label); return error; } /** * common_perm_cond - common permission wrapper around inode cond * @op: operation being checked * @path: location to check (NOT NULL) * @mask: requested permissions mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), d_backing_inode(path->dentry)->i_mode }; if (!path_mediated_fs(path->dentry)) return 0; return common_perm(op, path, mask, &cond); } /** * common_perm_dir_dentry - common permission wrapper when path is dir, dentry * @op: operation being checked * @dir: directory of the dentry (NOT NULL) * @dentry: dentry to check (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm_dir_dentry(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, struct path_cond *cond) { struct path path = { .mnt = dir->mnt, .dentry = dentry }; return common_perm(op, &path, mask, cond); } /** * common_perm_rm - common permission wrapper for operations doing rm * @op: operation being checked * @dir: directory that the dentry is in (NOT NULL) * @dentry: dentry being rm'd (NOT NULL) * @mask: requested permission mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } /** * common_perm_create - common permission wrapper for operations doing create * @op: operation being checked * @dir: directory that dentry will be created in (NOT NULL) * @dentry: dentry to create (NOT NULL) * @mask: request permission mask * @mode: created file mode * * Returns: %0 else error code if error or permission denied */ static int common_perm_create(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, umode_t mode) { struct path_cond cond = { current_fsuid(), mode }; if (!path_mediated_fs(dir->dentry)) return 0; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE, S_IFDIR); } static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode); } static int apparmor_path_truncate(const struct path *path) { return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR); } static int apparmor_file_truncate(struct file *file) { return apparmor_path_truncate(&file->f_path); } static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE, S_IFLNK); } static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_link(current_cred(), label, old_dentry, new_dir, new_dentry); end_current_label_crit_section(label); return error; } static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, const unsigned int flags) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; struct path new_path = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &new_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond_exchange); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &old_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond_exchange); } if (!error) error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &old_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &new_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond); } end_current_label_crit_section(label); return error; } static int apparmor_path_chmod(const struct path *path, umode_t mode) { return common_perm_cond(OP_CHMOD, path, AA_MAY_CHMOD); } static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { return common_perm_cond(OP_CHOWN, path, AA_MAY_CHOWN); } static int apparmor_inode_getattr(const struct path *path) { return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR); } static int apparmor_file_open(struct file *file) { struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; if (!path_mediated_fs(file->f_path.dentry)) return 0; /* If in exec, permission is handled by bprm hooks. * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. * * Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } label = aa_get_newest_cred_label(file->f_cred); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, file->f_cred, label, &file->f_path, 0, aa_map_file_to_perms(file), &cond); /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } aa_put_label(label); return error; } static int apparmor_file_alloc_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); struct aa_label *label = begin_current_label_crit_section(); spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); end_current_label_crit_section(label); return 0; } static void apparmor_file_free_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); if (ctx) aa_put_label(rcu_access_pointer(ctx->label)); } static int common_file_perm(const char *op, struct file *file, u32 mask, bool in_atomic) { struct aa_label *label; int error = 0; /* don't reaudit files closed during inheritance */ if (file->f_path.dentry == aa_null.dentry) return -EACCES; label = __begin_current_label_crit_section(); error = aa_file_perm(op, current_cred(), label, file, mask, in_atomic); __end_current_label_crit_section(label); return error; } static int apparmor_file_receive(struct file *file) { return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file), false); } static int apparmor_file_permission(struct file *file, int mask) { return common_file_perm(OP_FPERM, file, mask, false); } static int apparmor_file_lock(struct file *file, unsigned int cmd) { u32 mask = AA_MAY_LOCK; if (cmd == F_WRLCK) mask |= MAY_WRITE; return common_file_perm(OP_FLOCK, file, mask, false); } static int common_mmap(const char *op, struct file *file, unsigned long prot, unsigned long flags, bool in_atomic) { int mask = 0; if (!file || !file_ctx(file)) return 0; if (prot & PROT_READ) mask |= MAY_READ; /* * Private mappings don't require write perms since they don't * write back to the files */ if ((prot & PROT_WRITE) && !(flags & MAP_PRIVATE)) mask |= MAY_WRITE; if (prot & PROT_EXEC) mask |= AA_EXEC_MMAP; return common_file_perm(op, file, mask, in_atomic); } static int apparmor_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return common_mmap(OP_FMMAP, file, prot, flags, GFP_ATOMIC); } static int apparmor_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return common_mmap(OP_FMPROT, vma->vm_file, prot, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0, false); } #ifdef CONFIG_IO_URING static const char *audit_uring_mask(u32 mask) { if (mask & AA_MAY_CREATE_SQPOLL) return "sqpoll"; if (mask & AA_MAY_OVERRIDE_CRED) return "override_creds"; return ""; } static void audit_uring_cb(struct audit_buffer *ab, void *va) { struct apparmor_audit_data *ad = aad_of_va(va); if (ad->request & AA_URING_PERM_MASK) { audit_log_format(ab, " requested=\"%s\"", audit_uring_mask(ad->request)); if (ad->denied & AA_URING_PERM_MASK) { audit_log_format(ab, " denied=\"%s\"", audit_uring_mask(ad->denied)); } } if (ad->uring.target) { audit_log_format(ab, " tcontext="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->uring.target, FLAGS_NONE, GFP_ATOMIC); } } static int profile_uring(struct aa_profile *profile, u32 request, struct aa_label *new, int cap, struct apparmor_audit_data *ad) { unsigned int state; struct aa_ruleset *rules; int error = 0; AA_BUG(!profile); rules = list_first_entry(&profile->rules, typeof(*rules), list); state = RULE_MEDIATES(rules, AA_CLASS_IO_URING); if (state) { struct aa_perms perms = { }; if (new) { aa_label_match(profile, rules, new, state, false, request, &perms); } else { perms = *aa_lookup_perms(rules->policy, state); } aa_apply_modes_to_perms(profile, &perms); error = aa_check_perms(profile, &perms, request, ad, audit_uring_cb); } return error; } /** * apparmor_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override it's credentials * to service an io_uring operation. */ static int apparmor_uring_override_creds(const struct cred *new) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_OVERRIDE); ad.uring.target = cred_label(new); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_OVERRIDE_CRED, cred_label(new), CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } /** * apparmor_uring_sqpoll - check if a io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. */ static int apparmor_uring_sqpoll(void) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_SQPOLL); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_CREATE_SQPOLL, NULL, CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } #endif /* CONFIG_IO_URING */ static int apparmor_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { struct aa_label *label; int error = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; flags &= ~AA_MS_IGNORE_MASK; label = __begin_current_label_crit_section(); if (!unconfined(label)) { if (flags & MS_REMOUNT) error = aa_remount(current_cred(), label, path, flags, data); else if (flags & MS_BIND) error = aa_bind_mount(current_cred(), label, path, dev_name, flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) error = aa_mount_change_type(current_cred(), label, path, flags); else if (flags & MS_MOVE) error = aa_move_mount_old(current_cred(), label, path, dev_name); else error = aa_new_mount(current_cred(), label, dev_name, path, type, flags, data); } __end_current_label_crit_section(label); return error; } static int apparmor_move_mount(const struct path *from_path, const struct path *to_path) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_move_mount(current_cred(), label, from_path, to_path); __end_current_label_crit_section(label); return error; } static int apparmor_sb_umount(struct vfsmount *mnt, int flags) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_umount(current_cred(), label, mnt, flags); __end_current_label_crit_section(label); return error; } static int apparmor_sb_pivotroot(const struct path *old_path, const struct path *new_path) { struct aa_label *label; int error = 0; label = aa_get_current_label(); if (!unconfined(label)) error = aa_pivotroot(current_cred(), label, old_path, new_path); aa_put_label(label); return error; } static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, size_t *size, u32 flags) { int error = -ENOENT; struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; char *value = NULL; switch (attr) { case LSM_ATTR_CURRENT: label = aa_get_newest_label(cred_label(current_cred())); break; case LSM_ATTR_PREV: if (ctx->previous) label = aa_get_newest_label(ctx->previous); break; case LSM_ATTR_EXEC: if (ctx->onexec) label = aa_get_newest_label(ctx->onexec); break; default: error = -EOPNOTSUPP; break; } if (label) { error = aa_getprocattr(label, &value, false); if (error > 0) error = lsm_fill_user_ctx(lx, size, value, error, LSM_ID_APPARMOR, 0); kfree(value); } aa_put_label(label); if (error < 0) return error; return 1; } static int apparmor_getprocattr(struct task_struct *task, const char *name, char **value) { int error = -ENOENT; /* released below */ const struct cred *cred = get_task_cred(task); struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; if (strcmp(name, "current") == 0) label = aa_get_newest_label(cred_label(cred)); else if (strcmp(name, "prev") == 0 && ctx->previous) label = aa_get_newest_label(ctx->previous); else if (strcmp(name, "exec") == 0 && ctx->onexec) label = aa_get_newest_label(ctx->onexec); else error = -EINVAL; if (label) error = aa_getprocattr(label, value, true); aa_put_label(label); put_cred(cred); return error; } static int do_setattr(u64 attr, void *value, size_t size) { char *command, *largs = NULL, *args = value; size_t arg_size; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, OP_SETPROCATTR); if (size == 0) return -EINVAL; /* AppArmor requires that the buffer must be null terminated atm */ if (args[size - 1] != '\0') { /* null terminate */ largs = args = kmalloc(size + 1, GFP_KERNEL); if (!args) return -ENOMEM; memcpy(args, value, size); args[size] = '\0'; } error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) goto out; args = skip_spaces(args); if (!*args) goto out; arg_size = size - (args - (largs ? largs : (char *) value)); if (attr == LSM_ATTR_CURRENT) { if (strcmp(command, "changehat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permhat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_TEST); } else if (strcmp(command, "changeprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_TEST); } else if (strcmp(command, "stack") == 0) { error = aa_change_profile(args, AA_CHANGE_STACK); } else goto fail; } else if (attr == LSM_ATTR_EXEC) { if (strcmp(command, "exec") == 0) error = aa_change_profile(args, AA_CHANGE_ONEXEC); else if (strcmp(command, "stack") == 0) error = aa_change_profile(args, (AA_CHANGE_ONEXEC | AA_CHANGE_STACK)); else goto fail; } else /* only support the "current" and "exec" process attributes */ goto fail; if (!error) error = size; out: kfree(largs); return error; fail: ad.subj_label = begin_current_label_crit_section(); if (attr == LSM_ATTR_CURRENT) ad.info = "current"; else if (attr == LSM_ATTR_EXEC) ad.info = "exec"; else ad.info = "invalid"; ad.error = error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL); end_current_label_crit_section(ad.subj_label); goto out; } static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx, size_t size, u32 flags) { int rc; if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC) return -EOPNOTSUPP; rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); if (rc > 0) return 0; return rc; } static int apparmor_setprocattr(const char *name, void *value, size_t size) { int attr = lsm_name_to_attr(name); if (attr) return do_setattr(attr, value, size); return -EINVAL; } /** * apparmor_bprm_committing_creds - do task cleanup on committing new creds * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm) { struct aa_label *label = aa_current_raw_label(); struct aa_label *new_label = cred_label(bprm->cred); /* bail out if unconfined or not changing profile */ if ((new_label->proxy == label->proxy) || (unconfined(new_label))) return; aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_label); } /** * apparmor_bprm_committed_creds() - do cleanup after new creds committed * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) { /* clear out temporary/transitional state from the context */ aa_clear_task_ctx_trans(task_ctx(current)); return; } static void apparmor_current_getsecid_subj(u32 *secid) { struct aa_label *label = __begin_current_label_crit_section(); *secid = label->secid; __end_current_label_crit_section(label); } static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid) { struct aa_label *label = aa_get_task_label(p); *secid = label->secid; aa_put_label(label); } static int apparmor_task_setrlimit(struct task_struct *task, unsigned int resource, struct rlimit *new_rlim) { struct aa_label *label = __begin_current_label_crit_section(); int error = 0; if (!unconfined(label)) error = aa_task_setrlimit(current_cred(), label, task, resource, new_rlim); __end_current_label_crit_section(label); return error; } static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info, int sig, const struct cred *cred) { const struct cred *tc; struct aa_label *cl, *tl; int error; tc = get_task_cred(target); tl = aa_get_newest_cred_label(tc); if (cred) { /* * Dealing with USB IO specific behavior */ cl = aa_get_newest_cred_label(cred); error = aa_may_signal(cred, cl, tc, tl, sig); aa_put_label(cl); } else { cl = __begin_current_label_crit_section(); error = aa_may_signal(current_cred(), cl, tc, tl, sig); __end_current_label_crit_section(cl); } aa_put_label(tl); put_cred(tc); return error; } static int apparmor_userns_create(const struct cred *cred) { struct aa_label *label; struct aa_profile *profile; int error = 0; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS, OP_USERNS_CREATE); ad.subj_cred = current_cred(); label = begin_current_label_crit_section(); if (!unconfined(label)) { error = fn_for_each(label, profile, aa_profile_ns_perm(profile, &ad, AA_USERNS_CREATE)); } end_current_label_crit_section(label); return error; } static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags) { struct aa_sk_ctx *ctx; ctx = kzalloc(sizeof(*ctx), flags); if (!ctx) return -ENOMEM; sk->sk_security = ctx; return 0; } static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); sk->sk_security = NULL; aa_put_label(ctx->label); aa_put_label(ctx->peer); kfree(ctx); } /** * apparmor_sk_clone_security - clone the sk_security field * @sk: sock to have security cloned * @newsk: sock getting clone */ static void apparmor_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_sk_ctx *new = aa_sock(newsk); if (new->label) aa_put_label(new->label); new->label = aa_get_label(ctx->label); if (new->peer) aa_put_label(new->peer); new->peer = aa_get_label(ctx->peer); } static int apparmor_socket_create(int family, int type, int protocol, int kern) { struct aa_label *label; int error = 0; AA_BUG(in_interrupt()); label = begin_current_label_crit_section(); if (!(kern || unconfined(label))) error = af_select(family, create_perm(label, family, type, protocol), aa_af_perm(current_cred(), label, OP_CREATE, AA_MAY_CREATE, family, type, protocol)); end_current_label_crit_section(label); return error; } /** * apparmor_socket_post_create - setup the per-socket security struct * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket * @ptotocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: * - kernel sockets labeled kernel_t used to use unconfined * - socket may not have sk here if created with sock_create_lite or * sock_alloc. These should be accept cases which will be handled in * sock_graft. */ static int apparmor_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { struct aa_label *label; if (kern) { label = aa_get_label(kernel_t); } else label = aa_get_current_label(); if (sock->sk) { struct aa_sk_ctx *ctx = aa_sock(sock->sk); aa_put_label(ctx->label); ctx->label = aa_get_label(label); } aa_put_label(label); return 0; } static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, bind_perm(sock, address, addrlen), aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk)); } static int apparmor_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, connect_perm(sock, address, addrlen), aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk)); } static int apparmor_socket_listen(struct socket *sock, int backlog) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, listen_perm(sock, backlog), aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk)); } /* * Note: while @newsock is created and has some information, the accept * has not been done. */ static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!newsock); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, accept_perm(sock, newsock), aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk)); } static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, struct msghdr *msg, int size) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!msg); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, msg_perm(op, request, sock, msg, size), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); } static int apparmor_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); } /* revaliation, get/set attr, shutdown */ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, sock_perm(op, request, sock), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockname(struct socket *sock) { return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); } static int apparmor_socket_getpeername(struct socket *sock) { return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); } /* revaliation, get/set attr, opt */ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, int level, int optname) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, opt_perm(op, request, sock, level, optname), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, level, optname); } static int apparmor_socket_setsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, level, optname); } static int apparmor_socket_shutdown(struct socket *sock, int how) { return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } #ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk * @sk: sk to associate @skb with * @skb: skb to check for perms * * Note: can not sleep may be called with locks held * * dont want protocol specific in __skb_recv_datagram() * to deny an incoming connection socket_sock_rcv_skb() */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); } #endif static struct aa_label *sk_peer_label(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); if (ctx->peer) return ctx->peer; return ERR_PTR(-ENOPROTOOPT); } /** * apparmor_socket_getpeersec_stream - get security context of peer * @sock: socket that we are trying to get the peer context of * @optval: output - buffer to copy peer name to * @optlen: output - size of copied name in @optval * @len: size of @optval buffer * Returns: 0 on success, -errno of failure * * Note: for tcp only valid if using ipsec or cipso on lan */ static int apparmor_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { char *name = NULL; int slen, error = 0; struct aa_label *label; struct aa_label *peer; label = begin_current_label_crit_section(); peer = sk_peer_label(sock->sk); if (IS_ERR(peer)) { error = PTR_ERR(peer); goto done; } slen = aa_label_asxprint(&name, labels_ns(label), peer, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); /* don't include terminating \0 in slen, it breaks some apps */ if (slen < 0) { error = -ENOMEM; goto done; } if (slen > len) { error = -ERANGE; goto done_len; } if (copy_to_sockptr(optval, name, slen)) error = -EFAULT; done_len: if (copy_to_sockptr(optlen, &slen, sizeof(slen))) error = -EFAULT; done: end_current_label_crit_section(label); kfree(name); return error; } /** * apparmor_socket_getpeersec_dgram - get security label of packet * @sock: the peer socket * @skb: packet data * @secid: pointer to where to put the secid of the packet * * Sets the netlabel socket state on sk from parent */ static int apparmor_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { /* TODO: requires secid support */ return -ENOPROTOOPT; } /** * apparmor_sock_graft - Initialize newly created socket * @sk: child sock * @parent: parent socket * * Note: could set off of SOCK_CTX(parent) but need to track inode and we can * just set sk security information off of current creating process label * Labeling of sk for accept case - probably should be sock based * instead of task, because of the case where an implicitly labeled * socket is shared by different tasks. */ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!ctx->label) ctx->label = aa_get_current_label(); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT, skb->secmark, sk); } #endif /* * The cred blob is a pointer to, not an instance of, an aa_label. */ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), }; static const struct lsm_id apparmor_lsmid = { .name = "apparmor", .id = LSM_ID_APPARMOR, }; static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), LSM_HOOK_INIT(move_mount, apparmor_move_mount), LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir), LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir), LSM_HOOK_INIT(path_mknod, apparmor_path_mknod), LSM_HOOK_INIT(path_rename, apparmor_path_rename), LSM_HOOK_INIT(path_chmod, apparmor_path_chmod), LSM_HOOK_INIT(path_chown, apparmor_path_chown), LSM_HOOK_INIT(path_truncate, apparmor_path_truncate), LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr), LSM_HOOK_INIT(file_open, apparmor_file_open), LSM_HOOK_INIT(file_receive, apparmor_file_receive), LSM_HOOK_INIT(file_permission, apparmor_file_permission), LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security), LSM_HOOK_INIT(file_free_security, apparmor_file_free_security), LSM_HOOK_INIT(mmap_file, apparmor_mmap_file), LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect), LSM_HOOK_INIT(file_lock, apparmor_file_lock), LSM_HOOK_INIT(file_truncate, apparmor_file_truncate), LSM_HOOK_INIT(getselfattr, apparmor_getselfattr), LSM_HOOK_INIT(setselfattr, apparmor_setselfattr), LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), LSM_HOOK_INIT(socket_create, apparmor_socket_create), LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), #endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), #endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer), LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds), LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj), LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), LSM_HOOK_INIT(userns_create, apparmor_userns_create), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init), LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free), #endif LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx), LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid), LSM_HOOK_INIT(release_secctx, apparmor_release_secctx), #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll), #endif }; /* * AppArmor sysfs module parameters */ static int param_set_aabool(const char *val, const struct kernel_param *kp); static int param_get_aabool(char *buffer, const struct kernel_param *kp); #define param_check_aabool param_check_bool static const struct kernel_param_ops param_ops_aabool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aabool, .get = param_get_aabool }; static int param_set_aauint(const char *val, const struct kernel_param *kp); static int param_get_aauint(char *buffer, const struct kernel_param *kp); #define param_check_aauint param_check_uint static const struct kernel_param_ops param_ops_aauint = { .set = param_set_aauint, .get = param_get_aauint }; static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp); static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp); #define param_check_aacompressionlevel param_check_int static const struct kernel_param_ops param_ops_aacompressionlevel = { .set = param_set_aacompressionlevel, .get = param_get_aacompressionlevel }; static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp); static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp); #define param_check_aalockpolicy param_check_bool static const struct kernel_param_ops param_ops_aalockpolicy = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aalockpolicy, .get = param_get_aalockpolicy }; static int param_set_audit(const char *val, const struct kernel_param *kp); static int param_get_audit(char *buffer, const struct kernel_param *kp); static int param_set_mode(const char *val, const struct kernel_param *kp); static int param_get_mode(char *buffer, const struct kernel_param *kp); /* Flag values, also controllable via /sys/module/apparmor/parameters * We define special types as we want to do additional mediation. */ /* AppArmor global enforcement switch - complain, enforce, kill */ enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE; module_param_call(mode, param_set_mode, param_get_mode, &aa_g_profile_mode, S_IRUSR | S_IWUSR); /* whether policy verification hashing is enabled */ bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT); #ifdef CONFIG_SECURITY_APPARMOR_HASH module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR); #endif /* whether policy exactly as loaded is retained for debug and checkpointing */ bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY); #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); /* Debug mode */ bool aa_g_debug = IS_ENABLED(CONFIG_SECURITY_APPARMOR_DEBUG_MESSAGES); module_param_named(debug, aa_g_debug, aabool, S_IRUSR | S_IWUSR); /* Audit mode */ enum audit_mode aa_g_audit; module_param_call(audit, param_set_audit, param_get_audit, &aa_g_audit, S_IRUSR | S_IWUSR); /* Determines if audit header is included in audited messages. This * provides more context if the audit daemon is not running */ bool aa_g_audit_header = true; module_param_named(audit_header, aa_g_audit_header, aabool, S_IRUSR | S_IWUSR); /* lock out loading/removal of policy * TODO: add in at boot loading of policy, which is the only way to * load policy, if lock_policy is set */ bool aa_g_lock_policy; module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy, S_IRUSR | S_IWUSR); /* Syscall logging mode */ bool aa_g_logsyscall; module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR); /* Maximum pathname length before accesses will start getting rejected */ unsigned int aa_g_path_max = 2 * PATH_MAX; module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR); /* Determines how paranoid loading of policy is and how much verification * on the loaded policy is done. * DEPRECATED: read only as strict checking of load is always done now * that none root users (user namespaces) can load policy. */ bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD); module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO); static int param_get_aaintbool(char *buffer, const struct kernel_param *kp); static int param_set_aaintbool(const char *val, const struct kernel_param *kp); #define param_check_aaintbool param_check_int static const struct kernel_param_ops param_ops_aaintbool = { .set = param_set_aaintbool, .get = param_get_aaintbool }; /* Boot time disable flag */ static int apparmor_enabled __ro_after_init = 1; module_param_named(enabled, apparmor_enabled, aaintbool, 0444); static int __init apparmor_enabled_setup(char *str) { unsigned long enabled; int error = kstrtoul(str, 0, &enabled); if (!error) apparmor_enabled = enabled ? 1 : 0; return 1; } __setup("apparmor=", apparmor_enabled_setup); /* set global flag turning off the ability to load policy */ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aabool(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aabool(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aauint(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; /* file is ro but enforce 2nd line check */ if (apparmor_initialized) return -EPERM; error = param_set_uint(val, kp); aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer)); pr_info("AppArmor: buffer size set to %d bytes\n", aa_g_path_max); return error; } static int param_get_aauint(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_uint(buffer, kp); } /* Can only be set before AppArmor is initialized (i.e. on boot cmdline). */ static int param_set_aaintbool(const char *val, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; int error; if (apparmor_initialized) return -EPERM; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; error = param_set_bool(val, &kp_local); if (!error) *((int *)kp->arg) = *((bool *)kp_local.arg); return error; } /* * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to * 1/0, this converts the "int that is actually bool" back to bool for * display in the /sys filesystem, while keeping it "int" for the LSM * infrastructure. */ static int param_get_aaintbool(char *buffer, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; return param_get_bool(buffer, &kp_local); } static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized) return -EPERM; error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, AA_MIN_CLEVEL, AA_MAX_CLEVEL); pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); return error; } static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_int(buffer, kp); } static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]); } static int param_set_audit(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(audit_mode_names, AUDIT_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_audit = i; return 0; } static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(aa_profile_mode_names, APPARMOR_MODE_NAMES_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_profile_mode = i; return 0; } char *aa_get_buffer(bool in_atomic) { union aa_buffer *aa_buf; struct aa_local_cache *cache; bool try_again = true; gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); /* use per cpu cached buffers first */ cache = get_cpu_ptr(&aa_local_buffers); if (!list_empty(&cache->head)) { aa_buf = list_first_entry(&cache->head, union aa_buffer, list); list_del(&aa_buf->list); cache->hold--; cache->count--; put_cpu_ptr(&aa_local_buffers); return &aa_buf->buffer[0]; } put_cpu_ptr(&aa_local_buffers); if (!spin_trylock(&aa_buffers_lock)) { cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; put_cpu_ptr(&aa_local_buffers); spin_lock(&aa_buffers_lock); } else { cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); } retry: if (buffer_count > reserve_count || (in_atomic && !list_empty(&aa_global_buffers))) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); buffer_count--; spin_unlock(&aa_buffers_lock); return aa_buf->buffer; } if (in_atomic) { /* * out of reserve buffers and in atomic context so increase * how many buffers to keep in reserve */ reserve_count++; flags = GFP_ATOMIC; } spin_unlock(&aa_buffers_lock); if (!in_atomic) might_sleep(); aa_buf = kmalloc(aa_g_path_max, flags); if (!aa_buf) { if (try_again) { try_again = false; spin_lock(&aa_buffers_lock); goto retry; } pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); return NULL; } return aa_buf->buffer; } void aa_put_buffer(char *buf) { union aa_buffer *aa_buf; struct aa_local_cache *cache; if (!buf) return; aa_buf = container_of(buf, union aa_buffer, buffer[0]); cache = get_cpu_ptr(&aa_local_buffers); if (!cache->hold) { put_cpu_ptr(&aa_local_buffers); if (spin_trylock(&aa_buffers_lock)) { /* put back on global list */ list_add(&aa_buf->list, &aa_global_buffers); buffer_count++; spin_unlock(&aa_buffers_lock); cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); return; } /* contention on global list, fallback to percpu */ cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; } /* cache in percpu list */ list_add(&aa_buf->list, &cache->head); cache->count++; put_cpu_ptr(&aa_local_buffers); } /* * AppArmor init functions */ /** * set_init_ctx - set a task context and profile on the first task. * * TODO: allow setting an alternate profile than unconfined */ static int __init set_init_ctx(void) { struct cred *cred = (__force struct cred *)current->real_cred; set_cred_label(cred, aa_get_label(ns_unconfined(root_ns))); return 0; } static void destroy_buffers(void) { union aa_buffer *aa_buf; spin_lock(&aa_buffers_lock); while (!list_empty(&aa_global_buffers)) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); spin_unlock(&aa_buffers_lock); kfree(aa_buf); spin_lock(&aa_buffers_lock); } spin_unlock(&aa_buffers_lock); } static int __init alloc_buffers(void) { union aa_buffer *aa_buf; int i, num; /* * per cpu set of cached allocated buffers used to help reduce * lock contention */ for_each_possible_cpu(i) { per_cpu(aa_local_buffers, i).hold = 0; per_cpu(aa_local_buffers, i).count = 0; INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); } /* * A function may require two buffers at once. Usually the buffers are * used for a short period of time and are shared. On UP kernel buffers * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be * disabled early at boot if aa_g_path_max is extremly high. */ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; else num = 2 + RESERVE_COUNT; for (i = 0; i < num; i++) { aa_buf = kmalloc(aa_g_path_max, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!aa_buf) { destroy_buffers(); return -ENOMEM; } aa_put_buffer(aa_buf->buffer); } return 0; } #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!aa_current_policy_admin_capable(NULL)) return -EPERM; if (!apparmor_enabled) return -EINVAL; return proc_dointvec(table, write, buffer, lenp, ppos); } static struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", .data = &unprivileged_userns_apparmor_policy, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, #endif /* CONFIG_USER_NS */ { .procname = "apparmor_display_secid_mode", .data = &apparmor_display_secid_mode, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { .procname = "apparmor_restrict_unprivileged_unconfined", .data = &aa_unprivileged_unconfined_restricted, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { } }; static int __init apparmor_init_sysctl(void) { return register_sysctl("kernel", apparmor_sysctl_table) ? 0 : -ENOMEM; } #else static inline int apparmor_init_sysctl(void) { return 0; } #endif /* CONFIG_SYSCTL */ #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) static unsigned int apparmor_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct aa_sk_ctx *ctx; struct sock *sk; if (!skb->secmark) return NF_ACCEPT; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; ctx = aa_sock(sk); if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND, skb->secmark, sk)) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); } static const struct nf_hook_ops apparmor_nf_ops[] = { { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif }; static int __net_init apparmor_nf_register(struct net *net) { return nf_register_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static void __net_exit apparmor_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static struct pernet_operations apparmor_net_ops = { .init = apparmor_nf_register, .exit = apparmor_nf_unregister, }; static int __init apparmor_nf_ip_init(void) { int err; if (!apparmor_enabled) return 0; err = register_pernet_subsys(&apparmor_net_ops); if (err) panic("Apparmor: register_pernet_subsys: error %d\n", err); return 0; } __initcall(apparmor_nf_ip_init); #endif static char nulldfa_src[] = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; struct aa_policydb *nullpdb; static int __init aa_setup_dfa_engine(void) { int error = -ENOMEM; nullpdb = aa_alloc_pdb(GFP_KERNEL); if (!nullpdb) return -ENOMEM; nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(nulldfa)) { error = PTR_ERR(nulldfa); goto fail; } nullpdb->dfa = aa_get_dfa(nulldfa); nullpdb->perms = kcalloc(2, sizeof(struct aa_perms), GFP_KERNEL); if (!nullpdb->perms) goto fail; nullpdb->size = 2; stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src, sizeof(stacksplitdfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(stacksplitdfa)) { error = PTR_ERR(stacksplitdfa); goto fail; } return 0; fail: aa_put_pdb(nullpdb); aa_put_dfa(nulldfa); nullpdb = NULL; nulldfa = NULL; stacksplitdfa = NULL; return error; } static void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); aa_put_pdb(nullpdb); nullpdb = NULL; stacksplitdfa = NULL; nulldfa = NULL; } static int __init apparmor_init(void) { int error; error = aa_setup_dfa_engine(); if (error) { AA_ERROR("Unable to setup dfa engine\n"); goto alloc_out; } error = aa_alloc_root_ns(); if (error) { AA_ERROR("Unable to allocate default profile namespace\n"); goto alloc_out; } error = apparmor_init_sysctl(); if (error) { AA_ERROR("Unable to register sysctls\n"); goto alloc_out; } error = alloc_buffers(); if (error) { AA_ERROR("Unable to allocate work buffers\n"); goto alloc_out; } error = set_init_ctx(); if (error) { AA_ERROR("Failed to set context on init task\n"); aa_free_root_ns(); goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) aa_info_message("AppArmor initialized: complain mode enabled"); else if (aa_g_profile_mode == APPARMOR_KILL) aa_info_message("AppArmor initialized: kill mode enabled"); else aa_info_message("AppArmor initialized"); return error; buffers_out: destroy_buffers(); alloc_out: aa_destroy_aafs(); aa_teardown_dfa_engine(); apparmor_enabled = false; return error; } DEFINE_LSM(apparmor) = { .name = "apparmor", .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, .init = apparmor_init, };
15 16 16 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/seq_file.c * * helper functions for making synthetic files from sequences of records. * initial implementation -- AV, Oct 2001. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/string_helpers.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <asm/page.h> static struct kmem_cache *seq_file_cache __ro_after_init; static void seq_set_overflow(struct seq_file *m) { m->count = m->size; } static void *seq_buf_alloc(unsigned long size) { if (unlikely(size > MAX_RW_COUNT)) return NULL; return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** * seq_open - initialize sequential file * @file: file we initialize * @op: method table describing the sequence * * seq_open() sets @file, associating it with a sequence described * by @op. @op->start() sets the iterator up and returns the first * element of sequence. @op->stop() shuts it down. @op->next() * returns the next element of sequence. @op->show() prints element * into the buffer. In case of error ->start() and ->next() return * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". * Note: seq_open() will allocate a struct seq_file and store its * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { struct seq_file *p; WARN_ON(file->private_data); p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; file->private_data = p; mutex_init(&p->lock); p->op = op; // No refcounting: the lifetime of 'p' is constrained // to the lifetime of the file. p->file = file; /* * seq_files support lseek() and pread(). They do not implement * write() at all, but we clear FMODE_PWRITE here for historical * reasons. * * If a client of seq_files a) implements file.write() and b) wishes to * support pwrite() then that client will need to implement its own * file.open() which calls seq_open() and then sets FMODE_PWRITE. */ file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); static int traverse(struct seq_file *m, loff_t offset) { loff_t pos = 0; int error = 0; void *p; m->index = 0; m->count = m->from = 0; if (!offset) return 0; if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } p = m->op->start(m, &m->index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) break; error = m->op->show(m, p); if (error < 0) break; if (unlikely(error)) { error = 0; m->count = 0; } if (seq_has_overflowed(m)) goto Eoverflow; p = m->op->next(m, p, &m->index); if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; break; } pos += m->count; m->count = 0; if (pos == offset) break; } m->op->stop(m, p); return error; Eoverflow: m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } /** * seq_read - ->read() method for sequential files. * @file: the file to read from * @buf: the buffer to read to * @size: the maximum number of bytes to read * @ppos: the current position in the file * * Ready-made ->f_op->read() */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = size}; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, file); iov_iter_init(&iter, ITER_DEST, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); *ppos = kiocb.ki_pos; return ret; } EXPORT_SYMBOL(seq_read); /* * Ready-made ->f_op->read_iter() */ ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct seq_file *m = iocb->ki_filp->private_data; size_t copied = 0; size_t n; void *p; int err = 0; if (!iov_iter_count(iter)) return 0; mutex_lock(&m->lock); /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } /* Don't assume ki_pos is where we left it */ if (unlikely(iocb->ki_pos != m->read_pos)) { while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ m->read_pos = 0; m->index = 0; m->count = 0; goto Done; } else { m->read_pos = iocb->ki_pos; } } /* grab buffer if we didn't have one */ if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } // something left in the buffer - copy it out first if (m->count) { n = copy_to_iter(m->buf + m->from, m->count, iter); m->count -= n; m->from += n; copied += n; if (m->count) // hadn't managed to copy everything goto Done; } // get a non-empty record in the buffer m->from = 0; p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) // EOF or an error break; err = m->op->show(m, p); if (err < 0) // hard error break; if (unlikely(err)) // ->show() says "skip it" m->count = 0; if (unlikely(!m->count)) { // empty record p = m->op->next(m, p, &m->index); continue; } if (!seq_has_overflowed(m)) // got it goto Fill; // need a bigger buffer m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; p = m->op->start(m, &m->index); } // EOF or an error m->op->stop(m, p); m->count = 0; goto Done; Fill: // one non-empty record is in the buffer; if they want more, // try to fit more in, but in any case we need to advance // the iterator once for every record shown. while (1) { size_t offs = m->count; loff_t pos = m->index; p = m->op->next(m, p, &m->index); if (pos == m->index) { pr_info_ratelimited("buggy .next function %ps did not update position index\n", m->op->next); m->index++; } if (!p || IS_ERR(p)) // no next record for us break; if (m->count >= iov_iter_count(iter)) break; err = m->op->show(m, p); if (err > 0) { // ->show() says "skip it" m->count = offs; } else if (err || seq_has_overflowed(m)) { m->count = offs; break; } } m->op->stop(m, p); n = copy_to_iter(m->buf, m->count, iter); copied += n; m->count -= n; m->from = n; Done: if (unlikely(!copied)) { copied = m->count ? -EFAULT : err; } else { iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); return copied; Enomem: err = -ENOMEM; goto Done; } EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. * @file: the file in question * @offset: new position * @whence: 0 for absolute, 1 for relative position * * Ready-made ->f_op->llseek() */ loff_t seq_lseek(struct file *file, loff_t offset, int whence) { struct seq_file *m = file->private_data; loff_t retval = -EINVAL; mutex_lock(&m->lock); switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset < 0) break; retval = offset; if (offset != m->read_pos) { while ((retval = traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; m->read_pos = 0; m->index = 0; m->count = 0; } else { m->read_pos = offset; retval = file->f_pos = offset; } } else { file->f_pos = offset; } } mutex_unlock(&m->lock); return retval; } EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. * @file: file in question * @inode: its inode * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. */ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); /** * seq_escape_mem - print data into buffer, escaping some characters * @m: target buffer * @src: source buffer * @len: size of source buffer * @flags: flags to pass to string_escape_mem() * @esc: set of characters that need escaping * * Puts data into buffer, replacing each occurrence of character from * given class (defined by @flags and @esc) with printable escaped sequence. * * Use seq_has_overflowed() to check for errors. */ void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int ret; ret = string_escape_mem(src, len, buf, size, flags, esc); seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape_mem); void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; if (m->count < m->size) { len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_vprintf); void seq_printf(struct seq_file *m, const char *f, ...) { va_list args; va_start(args, f); seq_vprintf(m, f, args); va_end(args); } EXPORT_SYMBOL(seq_printf); #ifdef CONFIG_BINARY_PRINTF void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary) { int len; if (m->count < m->size) { len = bstr_printf(m->buf + m->count, m->size - m->count, f, binary); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_bprintf); #endif /* CONFIG_BINARY_PRINTF */ /** * mangle_path - mangle and copy path to buffer beginning * @s: buffer start * @p: beginning of path in above buffer * @esc: set of characters that need escaping * * Copy the path from @p to @s, replacing each occurrence of character from * @esc with usual octal escape. * Returns pointer past last written character in @s, or NULL in case of * failure. */ char *mangle_path(char *s, const char *p, const char *esc) { while (s <= p) { char c = *p++; if (!c) { return s; } else if (!strchr(esc, c)) { *s++ = c; } else if (s + 4 > p) { break; } else { *s++ = '\\'; *s++ = '0' + ((c & 0300) >> 6); *s++ = '0' + ((c & 070) >> 3); *s++ = '0' + (c & 07); } } return NULL; } EXPORT_SYMBOL(mangle_path); /** * seq_path - seq_file interface to print a pathname * @m: the seq_file handle * @path: the struct path to print * @esc: set of characters to escape in the output * * return the absolute path of 'path', as represented by the * dentry / mnt pair in the path parameter. */ int seq_path(struct seq_file *m, const struct path *path, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = d_path(path, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_path); /** * seq_file_path - seq_file interface to print a pathname of a file * @m: the seq_file handle * @file: the struct file to print * @esc: set of characters to escape in the output * * return the absolute path to the file. */ int seq_file_path(struct seq_file *m, struct file *file, const char *esc) { return seq_path(m, &file->f_path, esc); } EXPORT_SYMBOL(seq_file_path); /* * Same as seq_path, but relative to supplied root. */ int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -ENAMETOOLONG; if (size) { char *p; p = __d_path(path, root, buf, size); if (!p) return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; else res = -ENAMETOOLONG; } } seq_commit(m, res); return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* * returns the path of the 'dentry' from the root of its filesystem. */ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = dentry_path(dentry, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_dentry); void *single_start(struct seq_file *p, loff_t *pos) { return *pos ? NULL : SEQ_START_TOKEN; } static void *single_next(struct seq_file *p, void *v, loff_t *pos) { ++*pos; return NULL; } static void single_stop(struct seq_file *p, void *v) { } int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { op->start = single_start; op->next = single_next; op->stop = single_stop; op->show = show; res = seq_open(file, op); if (!res) ((struct seq_file *)file->private_data)->private = data; else kfree(op); } return res; } EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; ((struct seq_file *)file->private_data)->size = size; return 0; } EXPORT_SYMBOL(single_open_size); int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; int res = seq_release(inode, file); kfree(op); return res; } EXPORT_SYMBOL(single_release); int seq_release_private(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); seq->private = NULL; return seq_release(inode, file); } EXPORT_SYMBOL(seq_release_private); void *__seq_open_private(struct file *f, const struct seq_operations *ops, int psize) { int rc; void *private; struct seq_file *seq; private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; rc = seq_open(f, ops); if (rc < 0) goto out_free; seq = f->private_data; seq->private = private; return private; out_free: kfree(private); out: return NULL; } EXPORT_SYMBOL(__seq_open_private); int seq_open_private(struct file *filp, const struct seq_operations *ops, int psize) { return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; } EXPORT_SYMBOL(seq_open_private); void seq_putc(struct seq_file *m, char c) { if (m->count >= m->size) return; m->buf[m->count++] = c; } EXPORT_SYMBOL(seq_putc); void seq_puts(struct seq_file *m, const char *s) { int len = strlen(s); if (m->count + len >= m->size) { seq_set_overflow(m); return; } memcpy(m->buf + m->count, s, len); m->count += len; } EXPORT_SYMBOL(seq_puts); /** * seq_put_decimal_ull_width - A helper routine for putting decimal numbers * without rich format of printf(). * only 'unsigned long long' is supported. * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @num: the number * @width: a minimum field width * * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (!width) width = 1; if (m->count + width >= m->size) goto overflow; len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num) { return seq_put_decimal_ull_width(m, delimiter, num, 0); } EXPORT_SYMBOL(seq_put_decimal_ull); /** * seq_put_hex_ll - put a number in hexadecimal notation * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @v: the number * @width: a minimum field width * * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) * * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width) { unsigned int len; int i; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } /* If x is 0, the result of __builtin_clzll is undefined */ if (v == 0) len = 1; else len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; if (len < width) len = width; if (m->count + len > m->size) { seq_set_overflow(m); return; } for (i = len - 1; i >= 0; i--) { m->buf[m->count + i] = hex_asc[0xf & v]; v = v >> 4; } m->count += len; } void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (m->count + 2 >= m->size) goto overflow; if (num < 0) { m->buf[m->count++] = '-'; num = -num; } if (num < 10) { m->buf[m->count++] = num + '0'; return; } len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } EXPORT_SYMBOL(seq_put_decimal_ll); /** * seq_write - write arbitrary data to buffer * @seq: seq_file identifying the buffer to which data should be written * @data: data address * @len: number of bytes * * Return 0 on success, non-zero otherwise. */ int seq_write(struct seq_file *seq, const void *data, size_t len) { if (seq->count + len < seq->size) { memcpy(seq->buf + seq->count, data, len); seq->count += len; return 0; } seq_set_overflow(seq); return -1; } EXPORT_SYMBOL(seq_write); /** * seq_pad - write padding spaces to buffer * @m: seq_file identifying the buffer to which data should be written * @c: the byte to append after padding if non-zero */ void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; if (size > 0) { if (size + m->count > m->size) { seq_set_overflow(m); return; } memset(m->buf + m->count, ' ', size); m->count += size; } if (c) seq_putc(m, c); } EXPORT_SYMBOL(seq_pad); /* A complete analogue of print_hex_dump() */ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; char *buffer; size_t size; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; switch (prefix_type) { case DUMP_PREFIX_ADDRESS: seq_printf(m, "%s%p: ", prefix_str, ptr + i); break; case DUMP_PREFIX_OFFSET: seq_printf(m, "%s%.8x: ", prefix_str, i); break; default: seq_printf(m, "%s", prefix_str); break; } size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, buffer, size, ascii); seq_commit(m, ret < size ? ret : -1); seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start); struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head); struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = ((struct list_head *)v)->next; ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next); struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each_rcu(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start_rcu); struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head_rcu); struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = list_next_rcu((struct list_head *)v); ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next_rcu); /** * seq_hlist_start - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). */ struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) { struct hlist_node *node; hlist_for_each(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start); /** * seq_hlist_start_head - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. */ struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head); /** * seq_hlist_next - move to the next position of the hlist * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). */ struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return head->first; else return node->next; } EXPORT_SYMBOL(seq_hlist_next); /** * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos) { struct hlist_node *node; __hlist_for_each_rcu(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start_rcu); /** * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head_rcu); /** * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return rcu_dereference(head->first); else return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); /** * seq_hlist_start_percpu - start an iteration of a percpu hlist array * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->start(). */ struct hlist_node * seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) { struct hlist_node *node; for_each_possible_cpu(*cpu) { hlist_for_each(node, per_cpu_ptr(head, *cpu)) { if (pos-- == 0) return node; } } return NULL; } EXPORT_SYMBOL(seq_hlist_start_percpu); /** * seq_hlist_next_percpu - move to the next position of the percpu hlist array * @v: pointer to current hlist_node * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->next(). */ struct hlist_node * seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos) { struct hlist_node *node = v; ++*pos; if (node->next) return node->next; for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids; *cpu = cpumask_next(*cpu, cpu_possible_mask)) { struct hlist_head *bucket = per_cpu_ptr(head, *cpu); if (!hlist_empty(bucket)) return bucket->first; } return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); void __init seq_file_init(void) { seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); }
4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2018 - 2021, 2023 Intel Corporation */ #include <net/cfg80211.h> #include "core.h" #include "nl80211.h" #include "rdev-ops.h" static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, struct nlattr *ftmreq, struct cfg80211_pmsr_request_peer *out, struct genl_info *info) { const struct cfg80211_pmsr_capabilities *capa = rdev->wiphy.pmsr_capa; struct nlattr *tb[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1]; u32 preamble = NL80211_PREAMBLE_DMG; /* only optional in DMG */ /* validate existing data */ if (!(rdev->wiphy.pmsr_capa->ftm.bandwidths & BIT(out->chandef.width))) { NL_SET_ERR_MSG(info->extack, "FTM: unsupported bandwidth"); return -EINVAL; } /* no validation needed - was already done via nested policy */ nla_parse_nested_deprecated(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq, NULL, NULL); if (tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) preamble = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]); /* set up values - struct is 0-initialized */ out->ftm.requested = true; switch (out->chandef.chan->band) { case NL80211_BAND_60GHZ: /* optional */ break; default: if (!tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) { NL_SET_ERR_MSG(info->extack, "FTM: must specify preamble"); return -EINVAL; } } if (!(capa->ftm.preambles & BIT(preamble))) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE], "FTM: invalid preamble"); return -EINVAL; } out->ftm.preamble = preamble; out->ftm.burst_period = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]) out->ftm.burst_period = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]); out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP]; if (out->ftm.asap && !capa->ftm.asap) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP], "FTM: ASAP mode not supported"); return -EINVAL; } if (!out->ftm.asap && !capa->ftm.non_asap) { NL_SET_ERR_MSG(info->extack, "FTM: non-ASAP mode not supported"); return -EINVAL; } out->ftm.num_bursts_exp = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]) out->ftm.num_bursts_exp = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]); if (capa->ftm.max_bursts_exponent >= 0 && out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP], "FTM: max NUM_BURSTS_EXP must be set lower than the device limit"); return -EINVAL; } out->ftm.burst_duration = 15; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) out->ftm.burst_duration = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); if (capa->ftm.max_ftms_per_burst && (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst || out->ftm.ftms_per_burst == 0)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST], "FTM: FTMs per burst must be set lower than the device limit but non-zero"); return -EINVAL; } out->ftm.ftmr_retries = 3; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]) out->ftm.ftmr_retries = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]); out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI]; if (out->ftm.request_lci && !capa->ftm.request_lci) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI], "FTM: LCI request not supported"); } out->ftm.request_civicloc = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC]; if (out->ftm.request_civicloc && !capa->ftm.request_civicloc) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC], "FTM: civic location request not supported"); } out->ftm.trigger_based = !!tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED]; if (out->ftm.trigger_based && !capa->ftm.trigger_based) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED], "FTM: trigger based ranging is not supported"); return -EINVAL; } out->ftm.non_trigger_based = !!tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED]; if (out->ftm.non_trigger_based && !capa->ftm.non_trigger_based) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED], "FTM: trigger based ranging is not supported"); return -EINVAL; } if (out->ftm.trigger_based && out->ftm.non_trigger_based) { NL_SET_ERR_MSG(info->extack, "FTM: can't set both trigger based and non trigger based"); return -EINVAL; } if ((out->ftm.trigger_based || out->ftm.non_trigger_based) && out->ftm.preamble != NL80211_PREAMBLE_HE) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE], "FTM: non EDCA based ranging must use HE preamble"); return -EINVAL; } out->ftm.lmr_feedback = !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && out->ftm.lmr_feedback) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK], "FTM: LMR feedback set for EDCA based ranging"); return -EINVAL; } if (tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]) { if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR], "FTM: BSS color set for EDCA based ranging"); return -EINVAL; } out->ftm.bss_color = nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]); } return 0; } static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, struct nlattr *peer, struct cfg80211_pmsr_request_peer *out, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1]; struct nlattr *req[NL80211_PMSR_REQ_ATTR_MAX + 1]; struct nlattr *treq; int err, rem; /* no validation needed - was already done via nested policy */ nla_parse_nested_deprecated(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, NULL, NULL); if (!tb[NL80211_PMSR_PEER_ATTR_ADDR] || !tb[NL80211_PMSR_PEER_ATTR_CHAN] || !tb[NL80211_PMSR_PEER_ATTR_REQ]) { NL_SET_ERR_MSG_ATTR(info->extack, peer, "insufficient peer data"); return -EINVAL; } memcpy(out->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN); /* reuse info->attrs */ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_CHAN], NULL, info->extack); if (err) return err; err = nl80211_parse_chandef(rdev, info, &out->chandef); if (err) return err; /* no validation needed - was already done via nested policy */ nla_parse_nested_deprecated(req, NL80211_PMSR_REQ_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_REQ], NULL, NULL); if (!req[NL80211_PMSR_REQ_ATTR_DATA]) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_PEER_ATTR_REQ], "missing request type/data"); return -EINVAL; } if (req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF]) out->report_ap_tsf = true; if (out->report_ap_tsf && !rdev->wiphy.pmsr_capa->report_ap_tsf) { NL_SET_ERR_MSG_ATTR(info->extack, req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF], "reporting AP TSF is not supported"); return -EINVAL; } nla_for_each_nested(treq, req[NL80211_PMSR_REQ_ATTR_DATA], rem) { switch (nla_type(treq)) { case NL80211_PMSR_TYPE_FTM: err = pmsr_parse_ftm(rdev, treq, out, info); break; default: NL_SET_ERR_MSG_ATTR(info->extack, treq, "unsupported measurement type"); err = -EINVAL; } } if (err) return err; return 0; } int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) { struct nlattr *reqattr = info->attrs[NL80211_ATTR_PEER_MEASUREMENTS]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_pmsr_request *req; struct nlattr *peers, *peer; int count, rem, err, idx; if (!rdev->wiphy.pmsr_capa) return -EOPNOTSUPP; if (!reqattr) return -EINVAL; peers = nla_find(nla_data(reqattr), nla_len(reqattr), NL80211_PMSR_ATTR_PEERS); if (!peers) return -EINVAL; count = 0; nla_for_each_nested(peer, peers, rem) { count++; if (count > rdev->wiphy.pmsr_capa->max_peers) { NL_SET_ERR_MSG_ATTR(info->extack, peer, "Too many peers used"); return -EINVAL; } } req = kzalloc(struct_size(req, peers, count), GFP_KERNEL); if (!req) return -ENOMEM; req->n_peers = count; if (info->attrs[NL80211_ATTR_TIMEOUT]) req->timeout = nla_get_u32(info->attrs[NL80211_ATTR_TIMEOUT]); if (info->attrs[NL80211_ATTR_MAC]) { if (!rdev->wiphy.pmsr_capa->randomize_mac_addr) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[NL80211_ATTR_MAC], "device cannot randomize MAC address"); err = -EINVAL; goto out_err; } err = nl80211_parse_random_mac(info->attrs, req->mac_addr, req->mac_addr_mask); if (err) goto out_err; } else { memcpy(req->mac_addr, wdev_address(wdev), ETH_ALEN); eth_broadcast_addr(req->mac_addr_mask); } idx = 0; nla_for_each_nested(peer, peers, rem) { /* NB: this reuses info->attrs, but we no longer need it */ err = pmsr_parse_peer(rdev, peer, &req->peers[idx], info); if (err) goto out_err; idx++; } req->cookie = cfg80211_assign_cookie(rdev); req->nl_portid = info->snd_portid; err = rdev_start_pmsr(rdev, wdev, req); if (err) goto out_err; list_add_tail(&req->list, &wdev->pmsr_list); nl_set_extack_cookie_u64(info->extack, req->cookie); return 0; out_err: kfree(req); return err; } void cfg80211_pmsr_complete(struct wireless_dev *wdev, struct cfg80211_pmsr_request *req, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmsr_request *tmp, *prev, *to_free = NULL; struct sk_buff *msg; void *hdr; trace_cfg80211_pmsr_complete(wdev->wiphy, wdev, req->cookie); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) goto free_request; hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_COMPLETE); if (!hdr) goto free_msg; if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD)) goto free_msg; if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie, NL80211_ATTR_PAD)) goto free_msg; genlmsg_end(msg, hdr); genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid); goto free_request; free_msg: nlmsg_free(msg); free_request: spin_lock_bh(&wdev->pmsr_lock); /* * cfg80211_pmsr_process_abort() may have already moved this request * to the free list, and will free it later. In this case, don't free * it here. */ list_for_each_entry_safe(tmp, prev, &wdev->pmsr_list, list) { if (tmp == req) { list_del(&req->list); to_free = req; break; } } spin_unlock_bh(&wdev->pmsr_lock); kfree(to_free); } EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete); static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg, struct cfg80211_pmsr_result *res) { if (res->status == NL80211_PMSR_STATUS_FAILURE) { if (nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON, res->ftm.failure_reason)) goto error; if (res->ftm.failure_reason == NL80211_PMSR_FTM_FAILURE_PEER_BUSY && res->ftm.busy_retry_time && nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME, res->ftm.busy_retry_time)) goto error; return 0; } #define PUT(tp, attr, val) \ do { \ if (nla_put_##tp(msg, \ NL80211_PMSR_FTM_RESP_ATTR_##attr, \ res->ftm.val)) \ goto error; \ } while (0) #define PUTOPT(tp, attr, val) \ do { \ if (res->ftm.val##_valid) \ PUT(tp, attr, val); \ } while (0) #define PUT_U64(attr, val) \ do { \ if (nla_put_u64_64bit(msg, \ NL80211_PMSR_FTM_RESP_ATTR_##attr,\ res->ftm.val, \ NL80211_PMSR_FTM_RESP_ATTR_PAD)) \ goto error; \ } while (0) #define PUTOPT_U64(attr, val) \ do { \ if (res->ftm.val##_valid) \ PUT_U64(attr, val); \ } while (0) if (res->ftm.burst_index >= 0) PUT(u32, BURST_INDEX, burst_index); PUTOPT(u32, NUM_FTMR_ATTEMPTS, num_ftmr_attempts); PUTOPT(u32, NUM_FTMR_SUCCESSES, num_ftmr_successes); PUT(u8, NUM_BURSTS_EXP, num_bursts_exp); PUT(u8, BURST_DURATION, burst_duration); PUT(u8, FTMS_PER_BURST, ftms_per_burst); PUTOPT(s32, RSSI_AVG, rssi_avg); PUTOPT(s32, RSSI_SPREAD, rssi_spread); if (res->ftm.tx_rate_valid && !nl80211_put_sta_rate(msg, &res->ftm.tx_rate, NL80211_PMSR_FTM_RESP_ATTR_TX_RATE)) goto error; if (res->ftm.rx_rate_valid && !nl80211_put_sta_rate(msg, &res->ftm.rx_rate, NL80211_PMSR_FTM_RESP_ATTR_RX_RATE)) goto error; PUTOPT_U64(RTT_AVG, rtt_avg); PUTOPT_U64(RTT_VARIANCE, rtt_variance); PUTOPT_U64(RTT_SPREAD, rtt_spread); PUTOPT_U64(DIST_AVG, dist_avg); PUTOPT_U64(DIST_VARIANCE, dist_variance); PUTOPT_U64(DIST_SPREAD, dist_spread); if (res->ftm.lci && res->ftm.lci_len && nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_LCI, res->ftm.lci_len, res->ftm.lci)) goto error; if (res->ftm.civicloc && res->ftm.civicloc_len && nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC, res->ftm.civicloc_len, res->ftm.civicloc)) goto error; #undef PUT #undef PUTOPT #undef PUT_U64 #undef PUTOPT_U64 return 0; error: return -ENOSPC; } static int nl80211_pmsr_send_result(struct sk_buff *msg, struct cfg80211_pmsr_result *res) { struct nlattr *pmsr, *peers, *peer, *resp, *data, *typedata; pmsr = nla_nest_start_noflag(msg, NL80211_ATTR_PEER_MEASUREMENTS); if (!pmsr) goto error; peers = nla_nest_start_noflag(msg, NL80211_PMSR_ATTR_PEERS); if (!peers) goto error; peer = nla_nest_start_noflag(msg, 1); if (!peer) goto error; if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, res->addr)) goto error; resp = nla_nest_start_noflag(msg, NL80211_PMSR_PEER_ATTR_RESP); if (!resp) goto error; if (nla_put_u32(msg, NL80211_PMSR_RESP_ATTR_STATUS, res->status) || nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_HOST_TIME, res->host_time, NL80211_PMSR_RESP_ATTR_PAD)) goto error; if (res->ap_tsf_valid && nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_AP_TSF, res->ap_tsf, NL80211_PMSR_RESP_ATTR_PAD)) goto error; if (res->final && nla_put_flag(msg, NL80211_PMSR_RESP_ATTR_FINAL)) goto error; data = nla_nest_start_noflag(msg, NL80211_PMSR_RESP_ATTR_DATA); if (!data) goto error; typedata = nla_nest_start_noflag(msg, res->type); if (!typedata) goto error; switch (res->type) { case NL80211_PMSR_TYPE_FTM: if (nl80211_pmsr_send_ftm_res(msg, res)) goto error; break; default: WARN_ON(1); } nla_nest_end(msg, typedata); nla_nest_end(msg, data); nla_nest_end(msg, resp); nla_nest_end(msg, peer); nla_nest_end(msg, peers); nla_nest_end(msg, pmsr); return 0; error: return -ENOSPC; } void cfg80211_pmsr_report(struct wireless_dev *wdev, struct cfg80211_pmsr_request *req, struct cfg80211_pmsr_result *result, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; int err; trace_cfg80211_pmsr_report(wdev->wiphy, wdev, req->cookie, result->addr); /* * Currently, only variable items are LCI and civic location, * both of which are reasonably short so we don't need to * worry about them here for the allocation. */ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_RESULT); if (!hdr) goto free; if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD)) goto free; if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie, NL80211_ATTR_PAD)) goto free; err = nl80211_pmsr_send_result(msg, result); if (err) { pr_err_ratelimited("peer measurement result: message didn't fit!"); goto free; } genlmsg_end(msg, hdr); genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid); return; free: nlmsg_free(msg); } EXPORT_SYMBOL_GPL(cfg80211_pmsr_report); static void cfg80211_pmsr_process_abort(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmsr_request *req, *tmp; LIST_HEAD(free_list); lockdep_assert_wiphy(wdev->wiphy); spin_lock_bh(&wdev->pmsr_lock); list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) { if (req->nl_portid) continue; list_move_tail(&req->list, &free_list); } spin_unlock_bh(&wdev->pmsr_lock); list_for_each_entry_safe(req, tmp, &free_list, list) { rdev_abort_pmsr(rdev, wdev, req); kfree(req); } } void cfg80211_pmsr_free_wk(struct work_struct *work) { struct wireless_dev *wdev = container_of(work, struct wireless_dev, pmsr_free_wk); wiphy_lock(wdev->wiphy); cfg80211_pmsr_process_abort(wdev); wiphy_unlock(wdev->wiphy); } void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) { struct cfg80211_pmsr_request *req; bool found = false; spin_lock_bh(&wdev->pmsr_lock); list_for_each_entry(req, &wdev->pmsr_list, list) { found = true; req->nl_portid = 0; } spin_unlock_bh(&wdev->pmsr_lock); if (found) cfg80211_pmsr_process_abort(wdev); WARN_ON(!list_empty(&wdev->pmsr_list)); } void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid) { struct cfg80211_pmsr_request *req; spin_lock_bh(&wdev->pmsr_lock); list_for_each_entry(req, &wdev->pmsr_list, list) { if (req->nl_portid == portid) { req->nl_portid = 0; schedule_work(&wdev->pmsr_free_wk); } } spin_unlock_bh(&wdev->pmsr_lock); }
4 4 15244 15257 15260 15240 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> #include <linux/bug.h> #include <linux/nmi.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", [ ESTACK_VC ] = "#VC", [ ESTACK_VC2 ] = "#VC2", }; const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_TASK) return "TASK"; if (type == STACK_TYPE_IRQ) return "IRQ"; if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; if (type == STACK_TYPE_ENTRY) { /* * On 64-bit, we have a generic entry stack that we * use for all the kernel entry points, including * SYSENTER. */ return "ENTRY_TRAMPOLINE"; } if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; return NULL; } /** * struct estack_pages - Page descriptor for exception stacks * @offs: Offset from the start of the exception stack area * @size: Size of the exception stack * @type: Type to store in the stack_info struct */ struct estack_pages { u32 offs; u16 size; u16 type; }; #define EPAGERANGE(st) \ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ .offs = CEA_ESTACK_OFFS(st), \ .size = CEA_ESTACK_SIZE(st), \ .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } /* * Array of exception stack page descriptors. If the stack is larger than * PAGE_SIZE, all pages covering a particular stack will have the same * info. The guard pages including the not mapped DB2 stack are zeroed * out. */ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), EPAGERANGE(DB), EPAGERANGE(MCE), EPAGERANGE(VC), EPAGERANGE(VC2), }; static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long begin, end, stk = (unsigned long)stack; const struct estack_pages *ep; struct pt_regs *regs; unsigned int k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* * Handle the case where stack trace is collected _before_ * cea_exception_stacks had been initialized. */ if (!begin) return false; end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) return false; /* Calc page offset from start of exception stacks */ k = (stk - begin) >> PAGE_SHIFT; /* Lookup the page descriptor */ ep = &estack_pages[k]; /* Guard page? */ if (!ep->size) return false; begin += (unsigned long)ep->offs; end = begin + (unsigned long)ep->size; regs = (struct pt_regs *)end - 1; info->type = ep->type; info->begin = (unsigned long *)begin; info->end = (unsigned long *)end; info->next_sp = (unsigned long *)regs->sp; return true; } static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* * @end points directly to the top most stack entry to avoid a -8 * adjustment in the stack switch hotpath. Adjust it back before * calculating @begin. */ end++; begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * Due to the switching logic RSP can never be == @end because the * final operation is 'popq %rsp' which means after that RSP points * to the original stack and not to @end. */ if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; info->begin = begin; info->end = end; /* * The next stack pointer is stored at the top of the irq stack * before switching to the irq stack. Actual stack entries are all * below that. */ info->next_sp = (unsigned long *)*(end - 1); return true; } bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info) { if (in_task_stack(stack, task, info)) return true; if (task != current) return false; if (in_exception_stack(stack, info)) return true; if (in_irq_stack(stack, info)) return true; if (in_entry_stack(stack, info)) return true; return false; } int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { task = task ? : current; if (!stack) goto unknown; if (!get_stack_info_noinstr(stack, task, info)) goto unknown; /* * Make sure we don't iterate through any given stack more than once. * If it comes up a second time then there's something wrong going on: * just break out and report an unknown stack type. */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { if (task == current) printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; } return 0; unknown: info->type = STACK_TYPE_UNKNOWN; return -EINVAL; }
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/debugfs.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/udp_tunnel.h> #include "netdevsim.h" static int nsim_udp_tunnel_set_port(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti) { struct netdevsim *ns = netdev_priv(dev); int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; if (ns->udp_ports.sleep) msleep(ns->udp_ports.sleep); if (!ret) { if (ns->udp_ports.ports[table][entry]) { WARN(1, "entry already in use\n"); ret = -EBUSY; } else { ns->udp_ports.ports[table][entry] = be16_to_cpu(ti->port) << 16 | ti->type; } } netdev_info(dev, "set [%d, %d] type %d family %d port %d - %d\n", table, entry, ti->type, ti->sa_family, ntohs(ti->port), ret); return ret; } static int nsim_udp_tunnel_unset_port(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti) { struct netdevsim *ns = netdev_priv(dev); int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; if (ns->udp_ports.sleep) msleep(ns->udp_ports.sleep); if (!ret) { u32 val = be16_to_cpu(ti->port) << 16 | ti->type; if (val == ns->udp_ports.ports[table][entry]) { ns->udp_ports.ports[table][entry] = 0; } else { WARN(1, "entry not installed %x vs %x\n", val, ns->udp_ports.ports[table][entry]); ret = -ENOENT; } } netdev_info(dev, "unset [%d, %d] type %d family %d port %d - %d\n", table, entry, ti->type, ti->sa_family, ntohs(ti->port), ret); return ret; } static int nsim_udp_tunnel_sync_table(struct net_device *dev, unsigned int table) { struct netdevsim *ns = netdev_priv(dev); struct udp_tunnel_info ti; unsigned int i; int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; for (i = 0; i < NSIM_UDP_TUNNEL_N_PORTS; i++) { udp_tunnel_nic_get_port(dev, table, i, &ti); ns->udp_ports.ports[table][i] = be16_to_cpu(ti.port) << 16 | ti.type; } return ret; } static const struct udp_tunnel_nic_info nsim_udp_tunnel_info = { .set_port = nsim_udp_tunnel_set_port, .unset_port = nsim_udp_tunnel_unset_port, .sync_table = nsim_udp_tunnel_sync_table, .tables = { { .n_entries = NSIM_UDP_TUNNEL_N_PORTS, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, { .n_entries = NSIM_UDP_TUNNEL_N_PORTS, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE | UDP_TUNNEL_TYPE_VXLAN_GPE, }, }, }; static ssize_t nsim_udp_tunnels_info_reset_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct net_device *dev = file->private_data; struct netdevsim *ns = netdev_priv(dev); memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); rtnl_lock(); udp_tunnel_nic_reset_ntf(dev); rtnl_unlock(); return count; } static const struct file_operations nsim_udp_tunnels_info_reset_fops = { .open = simple_open, .write = nsim_udp_tunnels_info_reset_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); struct udp_tunnel_nic_info *info; if (nsim_dev->udp_ports.shared && nsim_dev->udp_ports.open_only) { dev_err(&nsim_dev->nsim_bus_dev->dev, "shared can't be used in conjunction with open_only\n"); return -EINVAL; } if (!nsim_dev->udp_ports.shared) ns->udp_ports.ports = ns->udp_ports.__ports; else ns->udp_ports.ports = nsim_dev->udp_ports.__ports; debugfs_create_u32("udp_ports_inject_error", 0600, ns->nsim_dev_port->ddir, &ns->udp_ports.inject_error); ns->udp_ports.dfs_ports[0].array = ns->udp_ports.ports[0]; ns->udp_ports.dfs_ports[0].n_elements = NSIM_UDP_TUNNEL_N_PORTS; debugfs_create_u32_array("udp_ports_table0", 0400, ns->nsim_dev_port->ddir, &ns->udp_ports.dfs_ports[0]); ns->udp_ports.dfs_ports[1].array = ns->udp_ports.ports[1]; ns->udp_ports.dfs_ports[1].n_elements = NSIM_UDP_TUNNEL_N_PORTS; debugfs_create_u32_array("udp_ports_table1", 0400, ns->nsim_dev_port->ddir, &ns->udp_ports.dfs_ports[1]); debugfs_create_file("udp_ports_reset", 0200, ns->nsim_dev_port->ddir, dev, &nsim_udp_tunnels_info_reset_fops); /* Note: it's not normal to allocate the info struct like this! * Drivers are expected to use a static const one, here we're testing. */ info = kmemdup(&nsim_udp_tunnel_info, sizeof(nsim_udp_tunnel_info), GFP_KERNEL); if (!info) return -ENOMEM; ns->udp_ports.sleep = nsim_dev->udp_ports.sleep; if (nsim_dev->udp_ports.sync_all) { info->set_port = NULL; info->unset_port = NULL; } else { info->sync_table = NULL; } if (ns->udp_ports.sleep) info->flags |= UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (nsim_dev->udp_ports.open_only) info->flags |= UDP_TUNNEL_NIC_INFO_OPEN_ONLY; if (nsim_dev->udp_ports.ipv4_only) info->flags |= UDP_TUNNEL_NIC_INFO_IPV4_ONLY; if (nsim_dev->udp_ports.shared) info->shared = &nsim_dev->udp_ports.utn_shared; if (nsim_dev->udp_ports.static_iana_vxlan) info->flags |= UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN; dev->udp_tunnel_nic_info = info; return 0; } void nsim_udp_tunnels_info_destroy(struct net_device *dev) { kfree(dev->udp_tunnel_nic_info); dev->udp_tunnel_nic_info = NULL; } void nsim_udp_tunnels_debugfs_create(struct nsim_dev *nsim_dev) { debugfs_create_bool("udp_ports_sync_all", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.sync_all); debugfs_create_bool("udp_ports_open_only", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.open_only); debugfs_create_bool("udp_ports_ipv4_only", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.ipv4_only); debugfs_create_bool("udp_ports_shared", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.shared); debugfs_create_bool("udp_ports_static_iana_vxlan", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.static_iana_vxlan); debugfs_create_u32("udp_ports_sleep", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.sleep); }
3 3 1 1 1 2 3 3 5 5 5 5 5 5 4 2 2 2 5 3 2 7 7 4 3 7 7 4 3 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 // SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. * * Authors: Pavel Emelyanov, <xemul@parallels.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/udp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) goto out; err = -ENOMEM; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_put(sk); out_nosk: return err; } static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct nlattr *bc; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct udp_hslot *hslot = &table->hash[slot]; struct sock *sk; num = 0; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } next: num++; } spin_unlock_bh(&hslot->lock); } done: cb->args[0] = slot; cb->args[1] = num; } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int __udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req, struct udp_table *tbl) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); } #endif else { rcu_read_unlock(); return -EINVAL; } if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); if (!sk) return -ENOENT; if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_put(sk); return -ENOENT; } err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, &udplite_table); } #endif static const struct inet_diag_handler udp_diag_handler = { .owner = THIS_MODULE, .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udp_diag_destroy, #endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { .owner = THIS_MODULE, .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udplite_diag_destroy, #endif }; static int __init udp_diag_init(void) { int err; err = inet_diag_register(&udp_diag_handler); if (err) goto out; err = inet_diag_register(&udplite_diag_handler); if (err) goto out_lite; out: return err; out_lite: inet_diag_unregister(&udp_diag_handler); goto out; } static void __exit udp_diag_exit(void) { inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
91 90 2 85 8 2 62 3 70 78 78 78 3 12 2 59 61 4 1 2 92 4 99 98 13 80 93 1 4 1 78 64 3 1 3 1 3 72 5 2 1 2 1 62 70 6 1 60 61 61 61 2 1 2 2 1 69 69 69 69 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for SCTP. * * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_NONE] = "NONE", [SCTP_CONNTRACK_CLOSED] = "CLOSED", [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT #define sCE SCTP_CONNTRACK_COOKIE_ECHOED #define sES SCTP_CONNTRACK_ESTABLISHED #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sIV SCTP_CONNTRACK_MAX /* These are the descriptions of the states: NOTE: These state names are tantalizingly similar to the states of an SCTP endpoint. But the interpretation of the states is a little different, considering that these are the states of the connection and not of an end point. Please note the subtleties. -Kiran NONE - Nothing so far. COOKIE WAIT - We have seen an INIT chunk in the original direction, or also an INIT_ACK chunk in the reply direction. COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. */ /* TODO - I have assumed that the first INIT is in the original direction. This messes things when an INIT comes in the reply direction in CLOSED state. - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - Full Multi Homing support. */ /* SCTP conntrack state transitions */ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ /* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ /* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, /* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #endif /* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned long *map, const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; int flag; flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { nf_ct_l4proto_log_invalid(skb, ct, state, "%s failed. chunk num %d, type %d, len %d flag %d\n", __func__, count, sch->type, sch->length, flag); return 1; } if (map) set_bit(sch->type, map); } return count == 0; } static int sctp_new_state(enum ip_conntrack_dir dir, enum sctp_conntrack cur_state, int chunk_type) { int i; switch (chunk_type) { case SCTP_CID_INIT: i = 0; break; case SCTP_CID_INIT_ACK: i = 1; break; case SCTP_CID_ABORT: i = 2; break; case SCTP_CID_SHUTDOWN: i = 3; break; case SCTP_CID_SHUTDOWN_ACK: i = 4; break; case SCTP_CID_ERROR: i = 5; break; case SCTP_CID_COOKIE_ECHO: i = 6; break; case SCTP_CID_COOKIE_ACK: i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: i = 8; break; case SCTP_CID_HEARTBEAT: i = 9; break; case SCTP_CID_HEARTBEAT_ACK: i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type %d, Will stay in %s\n", chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } return sctp_conntracks[dir][i][cur_state]; } /* Don't need lock here: this conntrack not in circulation yet */ static noinline bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct sctphdr *sh, unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u32 offset, count; memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ if (new_state == SCTP_CONNTRACK_NONE || new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return false; } /* Copy the vtag into the state info */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _inithdr, *ih; /* Sec 8.5.1 (A) */ if (sh->vtag) return false; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(_inithdr), &_inithdr); if (!ih) return false; pr_debug("Setting vtag %x for new conn\n", ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; } static bool sctp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { const struct sctphdr *sh; const char *logmsg; if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; } skb->ip_summed = CHECKSUM_UNNECESSARY; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); const struct sctphdr *sh; struct sctphdr _sctph; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ if (test_bit(SCTP_CID_ABORT, map) || test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || test_bit(SCTP_CID_COOKIE_ACK, map)) return -NF_ACCEPT; if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; } /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { nf_ct_l4proto_log_invalid(skb, ct, state, "verification tag check failed %x vs %x for dir %d", sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* (B) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* (C) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ACK) { ct->proto.sctp.init[dir] = 0; ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.last_dir = dir; ignore = true; continue; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || ct->proto.sctp.last_dir == dir) goto out_unlock; ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.vtag[dir] = sh->vtag; ct->proto.sctp.vtag[!dir] = 0; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } old_state = ct->proto.sctp.state; new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { nf_ct_l4proto_log_invalid(skb, ct, state, "Invalid, old_state %d, dir %d, type %d", old_state, dir, sch->type); goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _ih, *ih; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) ct->proto.sctp.init[!dir] = 0; ct->proto.sctp.init[dir] = 1; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so * port reuse by client or NAT middlebox cannot * keep entry alive indefinitely (incl. nat info). */ if (new_state == SCTP_CONNTRACK_CLOSED && old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; } else if (sch->type == SCTP_CID_INIT_ACK) { struct sctp_inithdr _ih, *ih; __be32 vtag; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; vtag = ct->proto.sctp.vtag[!dir]; if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) goto out_unlock; /* collision */ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && vtag != ih->init_tag) goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (new_state == SCTP_CONNTRACK_ESTABLISHED && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } } spin_unlock_bh(&ct->lock); /* allow but do not refresh timeout */ if (ignore) return NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; out_unlock: spin_unlock_bh(&ct->lock); out: return -NF_ACCEPT; } static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { case SCTP_CONNTRACK_SHUTDOWN_SENT: case SCTP_CONNTRACK_SHUTDOWN_RECD: case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; #define SCTP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 4) + \ NLA_ALIGN(NLA_HDRLEN + 4)) static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; int err; /* updates may not contain the internal protocol info, skip parsing */ if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, sctp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_SCTP_STATE] || !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) return -EINVAL; spin_lock_bh(&ct->lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; if (!timeouts) timeouts = sn->timeouts; /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } static int sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; /* timeouts[0] is unused, init it so ->timeouts[0] contains * 'new' timeout, like udp or icmp. */ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .can_early_drop = sctp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_SCTP_MAX, .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
9 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 // SPDX-License-Identifier: GPL-2.0 /* * NFS exporting and validation. * * We maintain a list of clients, each of which has a list of * exports. To export an fs to a given client, you first have * to create the client entry with NFSCTL_ADDCLIENT, which * creates a client control block and adds it to the hash * table. Then, you call NFSCTL_EXPORT for each fs. * * * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> */ #include <linux/slab.h> #include <linux/namei.h> #include <linux/module.h> #include <linux/exportfs.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsd.h" #include "nfsfh.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT /* * We have two caches. * One maps client+vfsmnt+dentry to export options - the export map * The other maps client+filehandle-fragment to export options. - the expkey map * * The export options are actually stored in the first map, and the * second map contains a reference to the entry in the first map. */ #define EXPKEY_HASHBITS 8 #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static void expkey_put(struct kref *ref) { struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); kfree_rcu(key, ek_rcu); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void expkey_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client fsidtype \xfsid */ struct svc_expkey *ek = container_of(h, struct svc_expkey, h); char type[5]; qword_add(bpp, blen, ek->ek_client->name); snprintf(type, 5, "%d", ek->ek_fsidtype); qword_add(bpp, blen, type); qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); (*bpp)[-1] = '\n'; } static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client fsidtype fsid expiry [path] */ char *buf; int len; struct auth_domain *dom = NULL; int err; int fsidtype; char *ep; struct svc_expkey key; struct svc_expkey *ek = NULL; if (mesg[mlen - 1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); err = -ENOMEM; if (!buf) goto out; err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; dprintk("found domain %s\n", buf); err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; fsidtype = simple_strtoul(buf, &ep, 10); if (*ep) goto out; dprintk("found fsidtype %d\n", fsidtype); if (key_len(fsidtype)==0) /* invalid type */ goto out; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; dprintk("found fsid length %d\n", len); if (len != key_len(fsidtype)) goto out; /* OK, we seem to have a valid key */ key.h.flags = 0; err = get_expiry(&mesg, &key.h.expiry_time); if (err) goto out; key.ek_client = dom; key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); ek = svc_expkey_lookup(cd, &key); err = -ENOMEM; if (!ek) goto out; /* now we want a pathname, or empty meaning NEGATIVE */ err = -EINVAL; len = qword_get(&mesg, buf, PAGE_SIZE); if (len < 0) goto out; dprintk("Path seems to be <%s>\n", buf); err = 0; if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, NULL); else err = -ENOMEM; } else { err = kern_path(buf, 0, &key.ek_path); if (err) goto out; dprintk("Found the path %s\n", buf); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, buf); else err = -ENOMEM; path_put(&key.ek_path); } cache_flush(); out: if (ek) cache_put(&ek->h, cd); if (dom) auth_domain_put(dom); kfree(buf); return err; } static int expkey_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_expkey *ek ; int i; if (h ==NULL) { seq_puts(m, "#domain fsidtype fsid [path]\n"); return 0; } ek = container_of(h, struct svc_expkey, h); seq_printf(m, "%s %d 0x", ek->ek_client->name, ek->ek_fsidtype); for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) seq_printf(m, "%08x", ek->ek_fsid[i]); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { seq_printf(m, " "); seq_path(m, &ek->ek_path, "\\ \t\n"); } seq_printf(m, "\n"); return 0; } static inline int expkey_match (struct cache_head *a, struct cache_head *b) { struct svc_expkey *orig = container_of(a, struct svc_expkey, h); struct svc_expkey *new = container_of(b, struct svc_expkey, h); if (orig->ek_fsidtype != new->ek_fsidtype || orig->ek_client != new->ek_client || memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) return 0; return 1; } static inline void expkey_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); kref_get(&item->ek_client->ref); new->ek_client = item->ek_client; new->ek_fsidtype = item->ek_fsidtype; memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); } static inline void expkey_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); new->ek_path = item->ek_path; path_get(&item->ek_path); } static struct cache_head *expkey_alloc(void) { struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); if (i) return &i->h; else return NULL; } static void expkey_flush(void) { /* * Take the nfsd_mutex here to ensure that the file cache is not * destroyed while we're in the middle of flushing. */ mutex_lock(&nfsd_mutex); nfsd_file_cache_purge(current->nsproxy->net_ns); mutex_unlock(&nfsd_mutex); } static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, .cache_upcall = expkey_upcall, .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, .flush = expkey_flush, }; static int svc_expkey_hash(struct svc_expkey *item) { int hash = item->ek_fsidtype; char * cp = (char*)item->ek_fsid; int len = key_len(item->ek_fsidtype); hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); hash &= EXPKEY_HASHMASK; return hash; } static struct svc_expkey * svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) { struct cache_head *ch; int hash = svc_expkey_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } static struct svc_expkey * svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old) { struct cache_head *ch; int hash = svc_expkey_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } #define EXPORT_HASHBITS 8 #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) { struct nfsd4_fs_location *locations = fsloc->locations; int i; if (!locations) return; for (i = 0; i < fsloc->locations_count; i++) { kfree(locations[i].path); kfree(locations[i].hosts); } kfree(locations); fsloc->locations = NULL; } static int export_stats_init(struct export_stats *stats) { stats->start_time = ktime_get_seconds(); return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM); } static void export_stats_reset(struct export_stats *stats) { if (stats) nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); } static void export_stats_destroy(struct export_stats *stats) { if (stats) nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); } static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void svc_export_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client path */ struct svc_export *exp = container_of(h, struct svc_export, h); char *pth; qword_add(bpp, blen, exp->ex_client->name); pth = d_path(&exp->ex_path, *bpp, *blen); if (IS_ERR(pth)) { /* is this correct? */ (*bpp)[0] = '\n'; return; } qword_add(bpp, blen, pth); (*bpp)[-1] = '\n'; } static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); static int check_export(struct path *path, int *flags, unsigned char *uuid) { struct inode *inode = d_inode(path->dentry); /* * We currently export only dirs, regular files, and (for v4 * pseudoroot) symlinks. */ if (!S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode) && !S_ISREG(inode->i_mode)) return -ENOTDIR; /* * Mountd should never pass down a writeable V4ROOT export, but, * just to make sure: */ if (*flags & NFSEXP_V4ROOT) *flags |= NFSEXP_READONLY; /* There are two requirements on a filesystem to be exportable. * 1: We must be able to identify the filesystem from a number. * either a device number (so FS_REQUIRES_DEV needed) * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set. * 3: We must not currently be on an idmapped mount. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(*flags & NFSEXP_FSID) && uuid == NULL) { dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; } if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) { dprintk("exp_export: export of invalid fs type.\n"); return -EINVAL; } if (is_idmapped_mnt(path->mnt)) { dprintk("exp_export: export of idmapped mounts not yet supported.\n"); return -EINVAL; } if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && !(*flags & NFSEXP_NOSUBTREECHECK)) { dprintk("%s: %s does not support subtree checking!\n", __func__, inode->i_sb->s_type->name); return -EINVAL; } return 0; } #ifdef CONFIG_NFSD_V4 static int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { int len; int migrated, i, err; /* more than one fsloc */ if (fsloc->locations) return -EINVAL; /* listsize */ err = get_uint(mesg, &fsloc->locations_count); if (err) return err; if (fsloc->locations_count > MAX_FS_LOCATIONS) return -EINVAL; if (fsloc->locations_count == 0) return 0; fsloc->locations = kcalloc(fsloc->locations_count, sizeof(struct nfsd4_fs_location), GFP_KERNEL); if (!fsloc->locations) return -ENOMEM; for (i=0; i < fsloc->locations_count; i++) { /* colon separated host list */ err = -EINVAL; len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].hosts) goto out_free_all; err = -EINVAL; /* slash separated path component list */ len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].path) goto out_free_all; } /* migrated */ err = get_int(mesg, &migrated); if (err) goto out_free_all; err = -EINVAL; if (migrated < 0 || migrated > 1) goto out_free_all; fsloc->migrated = migrated; return 0; out_free_all: nfsd4_fslocs_free(fsloc); return err; } static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { struct exp_flavor_info *f; u32 listsize; int err; /* more than one secinfo */ if (exp->ex_nflavors) return -EINVAL; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > MAX_SECINFO_LIST) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { err = get_uint(mesg, &f->pseudoflavor); if (err) return err; /* * XXX: It would be nice to also check whether this * pseudoflavor is supported, so we can discover the * problem at export time instead of when a client fails * to authenticate. */ err = get_uint(mesg, &f->flags); if (err) return err; /* Only some flags are allowed to differ between flavors: */ if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) return -EINVAL; } exp->ex_nflavors = listsize; return 0; } #else /* CONFIG_NFSD_V4 */ static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) { unsigned int i, mode, listsize; int err; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > NFSEXP_XPRTSEC_NUM) return -EINVAL; exp->ex_xprtsec_modes = 0; for (i = 0; i < listsize; i++) { err = get_uint(mesg, &mode); if (err) return err; if (mode > NFSEXP_XPRTSEC_MTLS) return -EINVAL; exp->ex_xprtsec_modes |= mode; } return 0; } static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { int len; /* more than one uuid */ if (*puuid) return -EINVAL; /* expect a 16 byte uuid encoded as \xXXXX... */ len = qword_get(mesg, buf, PAGE_SIZE); if (len != EX_UUID_LEN) return -EINVAL; *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); if (*puuid == NULL) return -ENOMEM; return 0; } static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; int an_int; if (mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; /* client */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; /* path */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); if (err) goto out1; exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ err = get_expiry(&mesg, &exp.h.expiry_time); if (err) goto out3; /* flags */ err = get_int(&mesg, &an_int); if (err == -ENOENT) { err = 0; set_bit(CACHE_NEGATIVE, &exp.h.flags); } else { if (err || an_int < 0) goto out3; exp.ex_flags= an_int; /* anon uid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_uid= make_kuid(current_user_ns(), an_int); /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_gid= make_kgid(current_user_ns(), an_int); /* fsid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_fsid = an_int; while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else if (strcmp(buf, "xprtsec") == 0) err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set * new values, then see what the result was. */ break; if (err) goto out4; } err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the * anon_uid/anon_gid checks: */ if (exp.h.expiry_time < seconds_since_boot()) goto out4; /* * For some reason exportfs has been passing down an * invalid (-1) uid & gid on the "dummy" export which it * uses to test export support. To make sure exportfs * sees errors from check_export we therefore need to * delay these checks till after check_export: */ err = -EINVAL; if (!uid_valid(exp.ex_anon_uid)) goto out4; if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); if (!expp) { err = -ENOMEM; goto out4; } expp = svc_export_update(&exp, expp); if (expp) { trace_nfsd_export_update(expp); cache_flush(); exp_put(expp); } else err = -ENOMEM; out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: path_put(&exp.ex_path); out1: auth_domain_put(dom); out: kfree(buf); return err; } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); static int is_export_stats_file(struct seq_file *m) { /* * The export_stats file uses the same ops as the exports file. * We use the file's name to determine the reported info per export. * There is no rename in nsfdfs, so d_name.name is stable. */ return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); } static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_export *exp; bool export_stats = is_export_stats_file(m); if (h == NULL) { if (export_stats) seq_puts(m, "#path domain start-time\n#\tstats\n"); else seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); if (export_stats) { struct percpu_counter *counter = exp->ex_stats->counter; seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); seq_printf(m, "\tfh_stale: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); seq_printf(m, "\tio_read: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); seq_printf(m, "\tio_write: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); seq_putc(m, '\n'); return 0; } seq_putc(m, '('); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); if (exp->ex_uuid) { int i; seq_puts(m, ",uuid="); for (i = 0; i < EX_UUID_LEN; i++) { if ((i&3) == 0 && i) seq_putc(m, ':'); seq_printf(m, "%02x", exp->ex_uuid[i]); } } show_secinfo(m, exp); } seq_puts(m, ")\n"); return 0; } static int svc_export_match(struct cache_head *a, struct cache_head *b) { struct svc_export *orig = container_of(a, struct svc_export, h); struct svc_export *new = container_of(b, struct svc_export, h); return orig->ex_client == new->ex_client && path_equal(&orig->ex_path, &new->ex_path); } static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; new->ex_path = item->ex_path; path_get(&item->ex_path); new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); int i; new->ex_flags = item->ex_flags; new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; new->ex_devid_map = item->ex_devid_map; item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; new->ex_layout_types = item->ex_layout_types; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); if (!i) return NULL; i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL); if (!i->ex_stats) { kfree(i); return NULL; } if (export_stats_init(i->ex_stats)) { kfree(i->ex_stats); kfree(i); return NULL; } return &i->h; } static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, .cache_upcall = svc_export_upcall, .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, .init = svc_export_init, .update = export_update, .alloc = svc_export_alloc, }; static int svc_export_hash(struct svc_export *exp) { int hash; hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); return hash; } static struct svc_export * svc_export_lookup(struct svc_export *exp) { struct cache_head *ch; int hash = svc_export_hash(exp); ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_export * svc_export_update(struct svc_export *new, struct svc_export *old) { struct cache_head *ch; int hash = svc_export_hash(old); ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_expkey * exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; int err; if (!clp) return ERR_PTR(-ENOENT); key.ek_client = clp; key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); ek = svc_expkey_lookup(cd, &key); if (ek == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &ek->h, reqp); if (err) { trace_nfsd_exp_find_key(&key, err); return ERR_PTR(err); } return ek; } static struct svc_export * exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, const struct path *path, struct cache_req *reqp) { struct svc_export *exp, key; int err; if (!clp) return ERR_PTR(-ENOENT); key.ex_client = clp; key.ex_path = *path; key.cd = cd; exp = svc_export_lookup(&key); if (exp == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &exp->h, reqp); if (err) { trace_nfsd_exp_get_by_name(&key, err); return ERR_PTR(err); } return exp; } /* * Find the export entry for a given dentry. */ static struct svc_export * exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = exp_get_by_name(cd, clp, path, NULL); } dput(path->dentry); path->dentry = saved; return exp; } /* * Obtain the root fh on behalf of a client. * This could be done in user space, but I feel that it adds some safety * since its harder to fool a kernel module than a user space program. */ int exp_rootfh(struct net *net, struct auth_domain *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; struct path path; struct inode *inode; struct svc_fh fh; int err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; err = -EPERM; /* NB: we probably ought to check that it's NUL-terminated */ if (kern_path(name, 0, &path)) { printk("nfsd: exp_rootfh path not found %s", name); return err; } inode = d_inode(path.dentry); dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); exp = exp_parent(cd, clp, &path); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; } /* * fh must be initialized before calling fh_compose */ fh_init(&fh, maxsize); if (fh_compose(&fh, exp, path.dentry, NULL)) err = -EINVAL; else err = 0; memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); fh_put(&fh); exp_put(exp); out: path_put(&path); return err; } static struct svc_export *exp_find(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_export *exp; struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id); struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp); if (IS_ERR(ek)) return ERR_CAST(ek); exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp); cache_put(&ek->h, nn->svc_expkey_cache); if (IS_ERR(exp)) return ERR_CAST(exp); return exp; } __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) { struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; struct svc_xprt *xprt = rqstp->rq_xprt; if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) goto ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } goto denied; ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return 0; /* ip-address based client; check sec= export option: */ for (f = exp->ex_flavors; f < end; f++) { if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) return 0; } /* defaults in absence of sec= options: */ if (exp->ex_nflavors == 0) { if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return 0; } /* If the compound op contains a spo_must_allowed op, * it will be sent with integrity/protection which * will have to be expressly allowed on mounts that * don't support it */ if (nfsd4_spo_must_allow(rqstp)) return 0; denied: return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec; } /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; * otherwise, will try to use rq_gssclient. * * Called from functions that handle requests; functions that do work on * behalf of mountd are passed a single client name to use, and should * use exp_get_by_name() or exp_find(). */ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } struct svc_export * rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ exp = exp_find(cd, rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } struct svc_export * rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = rqst_exp_get_by_name(rqstp, path); } dput(path->dentry); path->dentry = saved; return exp; } struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); return rqst_exp_find(rqstp, FSID_NUM, fsidv); } /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the * export point with fsid==0 */ __be32 exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) { struct svc_export *exp; __be32 rv; exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); exp_put(exp); return rv; } static struct flags { int flag; char *name[2]; } expflags[] = { { NFSEXP_READONLY, {"ro", "rw"}}, { NFSEXP_INSECURE_PORT, {"insecure", ""}}, { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, { NFSEXP_ALLSQUASH, {"all_squash", ""}}, { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; static void show_expflags(struct seq_file *m, int flags, int mask) { struct flags *flg; int state, first = 0; for (flg = expflags; flg->flag; flg++) { if (flg->flag & ~mask) continue; state = (flg->flag & flags) ? 0 : 1; if (*flg->name[state]) seq_printf(m, "%s%s", first++?",":"", flg->name[state]); } } static void show_secinfo_flags(struct seq_file *m, int flags) { seq_printf(m, ","); show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); } static bool secinfo_flags_equal(int f, int g) { f &= NFSEXP_SECINFO_FLAGS; g &= NFSEXP_SECINFO_FLAGS; return f == g; } static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) { int flags; flags = (*fp)->flags; seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); (*fp)++; while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { seq_printf(m, ":%d", (*fp)->pseudoflavor); (*fp)++; } return flags; } static void show_secinfo(struct seq_file *m, struct svc_export *exp) { struct exp_flavor_info *f; struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; int flags; if (exp->ex_nflavors == 0) return; f = exp->ex_flavors; flags = show_secinfo_run(m, &f, end); if (!secinfo_flags_equal(flags, exp->ex_flags)) show_secinfo_flags(m, flags); while (f != end) { flags = show_secinfo_run(m, &f, end); show_secinfo_flags(m, flags); } } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) { struct user_namespace *userns = m->file->f_cred->user_ns; show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) seq_printf(m, ",fsid=%d", fsid); if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) && !uid_eq(anonu, make_kuid(userns, 0x10000-2))) seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu)); if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) && !gid_eq(anong, make_kgid(userns, 0x10000-2))) seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong)); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? "refer" : "replicas"; int i; seq_printf(m, ",%s=", loctype); seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); for (i = 1; i < fsloc->locations_count; i++) { seq_putc(m, ';'); seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); } } } static int e_show(struct seq_file *m, void *p) { struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); if (export_stats) seq_puts(m, "# Path Client Start-time\n#\tStats\n"); else seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } exp_get(exp); if (cache_check(cd, &exp->h, NULL)) return 0; exp_put(exp); return svc_export_show(m, cd, cp); } const struct seq_operations nfs_exports_op = { .start = cache_seq_start_rcu, .next = cache_seq_next_rcu, .stop = cache_seq_stop_rcu, .show = e_show, }; /* * Initialize the exports module. */ int nfsd_export_init(struct net *net) { int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if (IS_ERR(nn->svc_export_cache)) return PTR_ERR(nn->svc_export_cache); rv = cache_register_net(nn->svc_export_cache, net); if (rv) goto destroy_export_cache; nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net); if (IS_ERR(nn->svc_expkey_cache)) { rv = PTR_ERR(nn->svc_expkey_cache); goto unregister_export_cache; } rv = cache_register_net(nn->svc_expkey_cache, net); if (rv) goto destroy_expkey_cache; return 0; destroy_expkey_cache: cache_destroy_net(nn->svc_expkey_cache, net); unregister_export_cache: cache_unregister_net(nn->svc_export_cache, net); destroy_export_cache: cache_destroy_net(nn->svc_export_cache, net); return rv; } /* * Flush exports table - called when last nfsd thread is killed */ void nfsd_export_flush(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); cache_purge(nn->svc_expkey_cache); cache_purge(nn->svc_export_cache); } /* * Shutdown the exports module. */ void nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); cache_destroy_net(nn->svc_expkey_cache, net); cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); }
38 20 23 37 38 111 39 11 1 1 6 1 1 100 120 117 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
5051 5061 7 7 5061 383 4895 4903 5058 5053 4893 383 600 600 600 598 600 600 600 600 599 595 598 599 600 599 599 1338 1340 356 356 1128 1128 57 1128 26 1126 25 1128 1129 1334 1341 1341 1341 281 1077 1341 1338 1340 1076 1076 1076 356 356 325 353 356 356 356 356 355 356 356 356 356 601 595 599 600 601 171 171 171 171 171 171 171 590 171 599 329 146 330 64 64 64 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ struct cpu_fbatches { local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch activate; #endif }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct folio *folio) { if (folio_test_lru(folio)) { struct lruvec *lruvec; unsigned long flags; lruvec = folio_lruvec_lock_irqsave(folio, &flags); lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); unlock_page_lruvec_irqrestore(lruvec, flags); } /* See comment on folio_test_mlocked in release_pages() */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); __folio_clear_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } } static void __folio_put_small(struct folio *folio) { __page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } static void __folio_put_large(struct folio *folio) { /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. This is because hugetlb page does never have PageLRU set * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ if (!folio_test_hugetlb(folio)) __page_cache_release(folio); destroy_large_folio(folio); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) free_zone_device_page(&folio->page); else if (unlikely(folio_test_large(folio))) __folio_put_large(folio); else __folio_put_small(folio); } EXPORT_SYMBOL(__folio_put); /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * * Release a list of pages which are strung together on page.lru. */ void put_pages_list(struct list_head *pages) { struct folio *folio, *next; list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) { list_del(&folio->lru); continue; } if (folio_test_large(folio)) { list_del(&folio->lru); __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ } free_unref_page_list(pages); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of release_pages(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. */ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, struct folio *folio, move_fn_t move_fn) { if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && !lru_cache_disabled()) return; folio_batch_move_lru(fbatch, move_fn); } static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (!folio_test_locked(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio) && folio_test_lru(folio)) { struct folio_batch *fbatch; unsigned long flags; folio_get(folio); local_lock_irqsave(&lru_rotate.lock, flags); fbatch = this_cpu_ptr(&lru_rotate.fbatch); folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; do { unsigned long lrusize; /* * Hold lruvec->lru_lock is safe here, since * 1) The pinned lruvec in reclaim, or * 2) From a pre-LRU page during refault (which also holds the * rcu lock, so would be safe even if the page was on the LRU * and could move simultaneously to a new lruvec). */ spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. */ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, folio_activate_fn); } void folio_activate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_active(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.activate); folio_batch_add_and_move(fbatch, folio, folio_activate_fn); local_unlock(&cpu_fbatches.lock); } } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (folio_test_clear_lru(folio)) { lruvec = folio_lruvec_lock_irq(folio); folio_activate_fn(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void folio_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; if (!folio_test_referenced(folio)) { folio_set_referenced(folio); return; } if (!folio_test_workingset(folio)) { folio_set_workingset(folio); return; } /* see the comment on MAX_NR_TIERS */ do { new_flags = old_flags & LRU_REFS_MASK; if (new_flags == LRU_REFS_MASK) break; new_flags += BIT(LRU_REFS_PGOFF); new_flags |= old_flags & ~LRU_REFS_MASK; } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } #else static void folio_inc_refs(struct folio *folio) { } #endif /* CONFIG_LRU_GEN */ /* * Mark a page as having seen activity. * * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced * * When a newly allocated page is not yet visible, so safe for non-atomic ops, * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void folio_mark_accessed(struct folio *folio) { if (lru_gen_enabled()) { folio_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) { struct folio_batch *fbatch; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_add_folio() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); folio_batch_add_and_move(fbatch, folio, lru_add_fn); local_unlock(&cpu_fbatches.lock); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. */ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) { if (folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add_fn); fbatch = &per_cpu(lru_rotate.fbatch, cpu); /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); folio_batch_move_lru(fbatch, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file_fn); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_fn); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree_fn); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { struct folio_batch *fbatch; /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); local_unlock(&cpu_fbatches.lock); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); local_unlock(&cpu_fbatches.lock); } } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (folio_test_lru(folio) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); local_unlock(&cpu_fbatches.lock); } } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of pages to be migrated using isolate_lru_page(). * It drains pages on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { int i; struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { struct folio *folio; /* Turn any of the argument types into a folio */ folio = page_folio(encoded_page_ptr(encoded[i])); /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the * same lruvec. The lock is held only if lruvec != NULL. */ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (is_huge_zero_page(&folio->page)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (put_devmap_managed_page(&folio->page)) continue; if (folio_put_testzero(folio)) free_zone_device_page(&folio->page); continue; } if (!folio_put_testzero(folio)) continue; if (folio_test_large(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } __folio_put_large(folio); continue; } if (folio_test_lru(folio)) { struct lruvec *prev_lruvec = lruvec; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); if (prev_lruvec != lruvec) lock_batch = 0; lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); } /* * In rare cases, when truncation or holepunching raced with * munlock after VM_LOCKED was cleared, Mlocked may still be * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large. */ if (unlikely(folio_test_mlocked(folio))) { __folio_clear_mlocked(folio); zone_stat_sub_folio(folio, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); } list_add(&folio->lru, &pages_to_free); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } release_pages(fbatch->folios, folio_batch_count(fbatch)); folio_batch_reinit(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. */ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ }
90 91 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/swap_state.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie * * Rewritten to use page cache, (C) 1998 Stephen Tweedie */ #include <linux/mm.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/mempolicy.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" #include "swap.h" /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_page_list. */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif }; struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) #define SWAP_RA_VAL(addr, win, hits) \ (((addr) & PAGE_MASK) | \ (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ ((hits) & SWAP_RA_HITS_MASK)) /* Initial readahead hits is 4 to start up with a small window */ #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); printk("Total swap = %lukB\n", K(total_swap_pages)); } void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); struct page *page; page = xa_load(&address_space->i_pages, idx); if (xa_is_value(page)) return page; return NULL; } /* * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); unsigned long i, nr = folio_nr_pages(folio); void *old; xas_set_update(&xas, workingset_update_node); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio_set_swapcache(folio); folio->swap = entry; do { xas_lock_irq(&xas); xas_create_range(&xas); if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); if (shadowp) { old = xas_load(&xas); if (xa_is_value(old)) *shadowp = old; } xas_store(&xas, folio); xas_next(&xas); } address_space->nrpages += nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (!xas_error(&xas)) return 0; folio_clear_swapcache(folio); folio_ref_sub(folio, nr); return xas_error(&xas); } /* * This must be called only on folios that have * been verified to be in the swap cache. */ void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); int i; long nr = folio_nr_pages(folio); pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); xas_set_update(&xas, workingset_update_node); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != folio, entry); xas_next(&xas); } folio->swap.val = 0; folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } /** * add_to_swap - allocate swap space for a folio * @folio: folio we want to move to swap * * Allocate swap space for the folio and add the folio to the * swap cache. * * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. */ bool add_to_swap(struct folio *folio) { swp_entry_t entry; int err; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); entry = folio_alloc_swap(folio); if (!entry.val) return false; /* * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC * stops emergency reserves from being allocated. * * TODO: this could cause a theoretical memory reclaim * deadlock in the swap out path. */ /* * Add it to the swap cache. */ err = add_to_swap_cache(folio, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ goto fail; /* * Normally the folio will be dirtied in unmap because its * pte should be dirty. A special case is MADV_FREE page. The * page's pte could have dirty bit cleared but the folio's * SwapBacked flag is still set because clearing the dirty bit * and SwapBacked flag has no lock protected. For such folio, * unmap will not set dirty bit for it, so folio reclaim will * not write the folio out. This can cause data corruption when * the folio is swapped in later. Always setting the dirty flag * for the folio solves the problem. */ folio_mark_dirty(folio); return true; fail: put_swap_folio(folio, entry); return false; } /* * This must be called only on folios that have * been verified to be in the swap cache and locked. * It will never put the folio into the free list, * the caller has a reference on the folio. */ void delete_from_swap_cache(struct folio *folio) { swp_entry_t entry = folio->swap; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end) { unsigned long curr = begin; void *old; for (;;) { swp_entry_t entry = swp_entry(type, curr); struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, curr); xas_set_update(&xas, workingset_update_node); xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, end) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); } xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ curr >>= SWAP_ADDRESS_SPACE_SHIFT; curr++; curr <<= SWAP_ADDRESS_SPACE_SHIFT; if (curr > end) break; } } /* * If we are the only user, then try to free up the swap cache. * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct page *page) { struct folio *folio = page_folio(page); if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } } /* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); if (!is_huge_zero_page(page)) put_page(page); } /* * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { lru_add_drain(); for (int i = 0; i < nr; i++) free_swap_cache(encoded_page_ptr(pages[i])); release_pages(pages, nr); } static inline bool swap_use_vma_readahead(void) { return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } /* * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the folio * lock before returning. * * Caller must lock the swap device or hold a reference to keep it valid. */ struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio; folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ if (unlikely(folio_test_large(folio))) return folio; readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; ra_val = GET_SWAP_RA_VAL(vma); win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } if (readahead) { count_vm_event(SWAP_RA_HIT); if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } else { folio = NULL; } return folio; } /** * filemap_get_incore_folio - Find and get a folio from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * * This differs from filemap_get_folio() in that it will also look for the * folio in the swap cache. * * Return: The found folio or %NULL. */ struct folio *filemap_get_incore_folio(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; struct folio *folio = filemap_get_entry(mapping, index); if (!folio) return ERR_PTR(-ENOENT); if (!xa_is_value(folio)) return folio; if (!shmem_mapping(mapping)) return ERR_PTR(-ENOENT); swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(swp)) return ERR_PTR(-ENOENT); /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) return ERR_PTR(-ENOENT); index = swp_offset(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); return folio; } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; void *shadow = NULL; *new_page_allocated = false; si = get_swap_device(entry); if (!si) return NULL; for (;;) { int err; /* * First check the swap cache. Since this is normally * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); if (!IS_ERR(folio)) goto got_folio; /* * Just skip read ahead for unused swap slot. * During swap_off when swap_slot_cache is disabled, * we have to handle the race between putting * swap entry in swap cache and marking swap slot * as SWAP_HAS_CACHE. That's done in later part of code or * else swap_off will be aborted if we return NULL. */ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) goto fail_put_swap; /* * Get a new folio to read into from swap. Allocate it now, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (!err) break; folio_put(folio); if (err != -EEXIST) goto fail_put_swap; /* * Protect against a recursive call to __read_swap_cache_async() * on the same entry waiting forever here because SWAP_HAS_CACHE * is set but the folio is not the swap cache yet. This can * happen today if mem_cgroup_swapin_charge_folio() below * triggers reclaim through zswap, which may call * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) goto fail_put_swap; /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE * in swap_map, but not yet added its folio to swap cache. */ schedule_timeout_uninterruptible(1); } /* * The swap entry is ours to swap in. Prepare the new folio. */ __folio_set_locked(folio); __folio_set_swapbacked(folio); if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); if (shadow) workingset_refault(folio, shadow); /* Caller will initiate read into locked folio */ folio_add_lru(folio); *new_page_allocated = true; got_folio: put_swap_device(si); return folio; fail_unlock: put_swap_folio(folio, entry); folio_unlock(folio); folio_put(folio); fail_put_swap: put_swap_device(si); return NULL; } /* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. * * get/put_swap_device() aren't needed to call this function, because * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. */ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = get_vma_policy(vma, addr, 0, &ilx); folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) swap_read_folio(folio, false, plug); return folio; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, unsigned long offset, int hits, int max_pages, int prev_win) { unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get * stuck here forever, so check for an adjacent offset instead * (and don't even bother to check whether swap type is same). */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; } else { unsigned int roundup = 4; while (roundup < pages) roundup <<= 1; pages = roundup; } if (pages > max_pages) pages = max_pages; /* Don't shrink readahead too fast */ last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; return pages; } static unsigned long swapin_nr_pages(unsigned long offset) { static unsigned long prev_offset; unsigned int hits, pages, max_pages; static atomic_t last_readahead_pages; max_pages = 1 << READ_ONCE(page_cluster); if (max_pages <= 1) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, max_pages, atomic_read(&last_readahead_pages)); if (!hits) WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; } /** * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * * Note: it is intentional that the same NUMA policy and interleave index * are used for every page of the readahead: neighbouring pages on swap * are fairly likely to have been swapped out from the same node. */ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; if (!start_offset) /* First page is swap header. */ start_offset++; if (end_offset >= si->max) end_offset = si->max - 1; blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ folio = __read_swap_cache_async( swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, &page_allocated, false); if (!folio) continue; if (page_allocated) { swap_read_folio(folio, false, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } } folio_put(folio); } blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) { zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); } return folio; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) { struct address_space *spaces, *space; unsigned int i, nr; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); if (!spaces) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); } nr_swapper_spaces[type] = nr; swapper_spaces[type] = spaces; return 0; } void exit_swap_address_space(unsigned int type) { int i; struct address_space *spaces = swapper_spaces[type]; for (i = 0; i < nr_swapper_spaces[type]; i++) VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); kvfree(spaces); nr_swapper_spaces[type] = 0; swapper_spaces[type] = NULL; } #define SWAP_RA_ORDER_CEILING 5 struct vma_swap_readahead { unsigned short win; unsigned short offset; unsigned short nr_pte; }; static void swap_ra_info(struct vm_fault *vmf, struct vma_swap_readahead *ra_info) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; unsigned long faddr, pfn, fpfn, lpfn, rpfn; unsigned long start, end; unsigned int max_win, hits, prev_win, win; max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) { ra_info->win = 1; return; } faddr = vmf->address; fpfn = PFN_DOWN(faddr); ra_val = GET_SWAP_RA_VAL(vma); pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); prev_win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); if (win == 1) return; if (fpfn == pfn + 1) { lpfn = fpfn; rpfn = fpfn + win; } else if (pfn == fpfn + 1) { lpfn = fpfn - win + 1; rpfn = fpfn + 1; } else { unsigned int left = (win - 1) / 2; lpfn = fpfn - left; rpfn = fpfn + win - left; } start = max3(lpfn, PFN_DOWN(vma->vm_start), PFN_DOWN(faddr & PMD_MASK)); end = min3(rpfn, PFN_DOWN(vma->vm_end), PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); ra_info->nr_pte = end - start; ra_info->offset = fpfn - start; } /** * swap_vma_readahead - swap in pages in hope we need them soon * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. * * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; struct folio *folio; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; pgoff_t ilx; unsigned int i; bool page_allocated; struct vma_swap_readahead ra_info = { .win = 1, }; swap_ra_info(vmf, &ra_info); if (ra_info.win == 1) goto skip; addr = vmf->address - (ra_info.offset * PAGE_SIZE); ilx = targ_ilx - ra_info.offset; blk_start_plug(&plug); for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) break; } pentry = ptep_get_lockless(pte); if (!is_swap_pte(pentry)) continue; entry = pte_to_swp_entry(pentry); if (unlikely(non_swap_entry(entry))) continue; pte_unmap(pte); pte = NULL; folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (!folio) continue; if (page_allocated) { swap_read_folio(folio, false, &splug); if (i != ra_info.offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } } folio_put(folio); } if (pte) pte_unmap(pte); blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) { zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); } return folio; } /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. * * It's a main entry function for swap readahead. By the configuration, * it will read ahead blocks by cluster-based(ie, physical disk based) * or vma-based(ie, virtual address based on faulty address) readahead. */ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); folio = swap_use_vma_readahead() ? swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); if (!folio) return NULL; return folio_file_page(folio, swp_offset(entry)); } #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", enable_vma_readahead ? "true" : "false"); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret; ret = kstrtobool(buf, &enable_vma_readahead); if (ret) return ret; return count; } static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, NULL, }; static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; static int __init swap_init_sysfs(void) { int err; struct kobject *swap_kobj; swap_kobj = kobject_create_and_add("swap", mm_kobj); if (!swap_kobj) { pr_err("failed to create swap kobject\n"); return -ENOMEM; } err = sysfs_create_group(swap_kobj, &swap_attr_group); if (err) { pr_err("failed to register swap group\n"); goto delete_obj; } return 0; delete_obj: kobject_put(swap_kobj); return err; } subsys_initcall(swap_init_sysfs); #endif
605 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking helpers. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_helper.h */ #ifndef _NF_CONNTRACK_HELPER_H #define _NF_CONNTRACK_HELPER_H #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_expect.h> #define NF_NAT_HELPER_PREFIX "ip_nat_" #define NF_NAT_HELPER_NAME(name) NF_NAT_HELPER_PREFIX name #define MODULE_ALIAS_NF_NAT_HELPER(name) \ MODULE_ALIAS(NF_NAT_HELPER_NAME(name)) struct module; enum nf_ct_helper_flags { NF_CT_HELPER_F_USERSPACE = (1 << 0), NF_CT_HELPER_F_CONFIGURED = (1 << 1), }; #define NF_CT_HELPER_NAME_LEN 16 struct nf_conntrack_helper { struct hlist_node hnode; /* Internal use. */ char name[NF_CT_HELPER_NAME_LEN]; /* name of the module */ refcount_t refcnt; struct module *me; /* pointer to self */ const struct nf_conntrack_expect_policy *expect_policy; /* Tuple of things we will help (compared against server response) */ struct nf_conntrack_tuple tuple; /* Function to call when data passes; return verdict, or -1 to invalidate. */ int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info conntrackinfo); void (*destroy)(struct nf_conn *ct); int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct); int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct); unsigned int expect_class_max; unsigned int flags; /* For user-space helpers: */ unsigned int queue_num; /* length of userspace private data stored in nf_conn_help->data */ u16 data_len; /* name of NAT helper module */ char nat_mod_name[NF_CT_HELPER_NAME_LEN]; }; /* Must be kept in sync with the classes defined by helpers */ #define NF_CT_MAX_EXPECT_CLASSES 4 /* nf_conn feature for connections that have a helper */ struct nf_conn_help { /* Helper. if any */ struct nf_conntrack_helper __rcu *helper; struct hlist_head expectations; /* Current number of expected connections */ u8 expecting[NF_CT_MAX_EXPECT_CLASSES]; /* private helper information. */ char data[32] __aligned(8); }; #define NF_CT_HELPER_BUILD_BUG_ON(structsize) \ BUILD_BUG_ON((structsize) > sizeof_field(struct nf_conn_help, data)) struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum); struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper); void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct), struct module *module); int nf_conntrack_helper_register(struct nf_conntrack_helper *); void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *, unsigned int); struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u16 proto); int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, u8 proto, bool nat, struct nf_conntrack_helper **hp); void nf_ct_helper_destroy(struct nf_conn *ct); static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) { return nf_ct_ext_find(ct, NF_CT_EXT_HELPER); } static inline void *nfct_help_data(const struct nf_conn *ct) { struct nf_conn_help *help; help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); return (void *)help->data; } int nf_conntrack_helper_init(void); void nf_conntrack_helper_fini(void); int nf_conntrack_broadcast_help(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int timeout); struct nf_ct_helper_expectfn { struct list_head head; const char *name; void (*expectfn)(struct nf_conn *ct, struct nf_conntrack_expect *exp); }; __printf(3,4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n); struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_name(const char *name); struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_symbol(const void *symbol); extern struct hlist_head *nf_ct_helper_hash; extern unsigned int nf_ct_helper_hsize; struct nf_conntrack_nat_helper { struct list_head list; char mod_name[NF_CT_HELPER_NAME_LEN]; /* module name */ struct module *module; /* pointer to self */ }; #define NF_CT_NAT_HELPER_INIT(name) \ { \ .mod_name = NF_NAT_HELPER_NAME(name), \ .module = THIS_MODULE \ } void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat); void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_nat_helper_put(struct nf_conntrack_helper *helper); #endif /*_NF_CONNTRACK_HELPER_H*/
9 9 4 55 54 17 52 3 4 4 3 9 9 9 9 2 8 4 128 128 128 128 128 128 127 41 128 127 128 112 128 14 18 10 10 10 10 10 10 10 9 9 9 7 1 8 7 7 6 1 2 48 45 3 2 41 3 1 4 2 39 40 1 41 1 42 39 1 2 40 2 40 1 41 42 41 1 41 1 1 39 2 39 2 36 37 2 32 6 4 29 6 4 8 4 5 4 21 1 1 1 5 5 8 5 3 3 5 5 3 5 3 5 3 3 5 5 3 5 5 32 1 1 12 18 1 1 2 2 2 2 2 4 3 11 11 11 10 8 10 11 11 11 11 10 11 11 11 11 11 11 4 4 4 1 4 4 4 5 5 2 1 2 2 2 1 2 2 7 7 7 163 163 163 54 52 54 453 456 438 144 144 19 19 19 35 35 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int ret = 0; if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) goto out; if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ops->fib_rules_seq++; return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ASSERT_RTNL(); ops = lookup_rules_ops(net, family); if (!ops) return 0; fib_rules_seq = ops->fib_rules_seq; rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } else { nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = tb[FRA_PROTOCOL] ? nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; if (tb[FRA_IIFNAME]) { struct net_device *dev; nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ if (nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_PRIORITY] = { .type = NLA_U32 }, [FRA_FWMARK] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, [FRA_TUN_ID] = { .type = NLA_U64 }, [FRA_FWMASK] = { .type = NLA_U32 }, [FRA_TABLE] = { .type = NLA_U32 }, [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, [FRA_GOTO] = { .type = NLA_U32 }, [FRA_L3MDEV] = { .type = NLA_U8 }, [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_newrule); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; int err = -EINVAL; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout: kfree(nlrule); rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (rule->iifindex == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (rule->oifindex == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int idx = 0, family; if (cb->strict_check) { int err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; dump_rules(skb, cb, ops); return skb->len; } rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; if (dump_rules(skb, cb, ops) < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) rule->oifindex = dev->ifindex; } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; if (rule->oifindex == dev->ifindex) rule->oifindex = -1; } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static int __init fib_rules_init(void) { int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); rtnl_unregister(PF_UNSPEC, RTM_DELRULE); rtnl_unregister(PF_UNSPEC, RTM_GETRULE); return err; } subsys_initcall(fib_rules_init);
2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 // SPDX-License-Identifier: GPL-2.0-only #include <net/xdp_sock_drv.h> #include "netlink.h" #include "common.h" struct channels_req_info { struct ethnl_req_info base; }; struct channels_reply_data { struct ethnl_reply_data base; struct ethtool_channels channels; }; #define CHANNELS_REPDATA(__reply_base) \ container_of(__reply_base, struct channels_reply_data, base) const struct nla_policy ethnl_channels_get_policy[] = { [ETHTOOL_A_CHANNELS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int channels_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_channels) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_channels(dev, &data->channels); ethnl_ops_complete(dev); return 0; } static int channels_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_COMBINED_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_COUNT */ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_COUNT */ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_COUNT */ nla_total_size(sizeof(u32)); /* _CHANNELS_COMBINED_COUNT */ } static int channels_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); const struct ethtool_channels *channels = &data->channels; if ((channels->max_rx && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_MAX, channels->max_rx) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_COUNT, channels->rx_count))) || (channels->max_tx && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_MAX, channels->max_tx) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_COUNT, channels->tx_count))) || (channels->max_other && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_MAX, channels->max_other) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_COUNT, channels->other_count))) || (channels->max_combined && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_MAX, channels->max_combined) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_COUNT, channels->combined_count)))) return -EMSGSIZE; return 0; } /* CHANNELS_SET */ const struct nla_policy ethnl_channels_set_policy[] = { [ETHTOOL_A_CHANNELS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 }, [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 }, [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 }, [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 }, }; static int ethnl_set_channels_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_channels && ops->set_channels ? 1 : -EOPNOTSUPP; } static int ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) { unsigned int from_channel, old_total, i; bool mod = false, mod_combined = false; struct net_device *dev = req_info->dev; struct ethtool_channels channels = {}; struct nlattr **tb = info->attrs; u32 err_attr, max_rxfh_in_use; u64 max_rxnfc_in_use; int ret; dev->ethtool_ops->get_channels(dev, &channels); old_total = channels.combined_count + max(channels.rx_count, channels.tx_count); ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT], &mod); ethnl_update_u32(&channels.tx_count, tb[ETHTOOL_A_CHANNELS_TX_COUNT], &mod); ethnl_update_u32(&channels.other_count, tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod); ethnl_update_u32(&channels.combined_count, tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); mod |= mod_combined; if (!mod) return 0; /* ensure new channel counts are within limits */ if (channels.rx_count > channels.max_rx) err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (channels.tx_count > channels.max_tx) err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else if (channels.other_count > channels.max_other) err_attr = ETHTOOL_A_CHANNELS_OTHER_COUNT; else if (channels.combined_count > channels.max_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; else err_attr = 0; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel count exceeds maximum"); return -EINVAL; } /* ensure there is at least one RX and one TX channel */ if (!channels.combined_count && !channels.rx_count) err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (!channels.combined_count && !channels.tx_count) err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else err_attr = 0; if (err_attr) { if (mod_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel counts would result in no RX or TX channel being configured"); return -EINVAL; } /* ensure the new Rx count fits within the configured Rx flow * indirection table/rxnfc settings */ if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) max_rxnfc_in_use = 0; if (!netif_is_rxfh_configured(dev) || ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use)) max_rxfh_in_use = 0; if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) { GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); return -EINVAL; } if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) { GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings"); return -EINVAL; } /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); return -EINVAL; } ret = dev->ethtool_ops->set_channels(dev, &channels); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_channels_request_ops = { .request_cmd = ETHTOOL_MSG_CHANNELS_GET, .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, .req_info_size = sizeof(struct channels_req_info), .reply_data_size = sizeof(struct channels_reply_data), .prepare_data = channels_prepare_data, .reply_size = channels_reply_size, .fill_reply = channels_fill_reply, .set_validate = ethnl_set_channels_validate, .set = ethnl_set_channels, .set_ntf_cmd = ETHTOOL_MSG_CHANNELS_NTF, };
1665 123 1564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 // SPDX-License-Identifier: GPL-2.0-only #include <linux/uaccess.h> #include <linux/kernel.h> #include <asm/vsyscall.h> #ifdef CONFIG_X86_64 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { unsigned long vaddr = (unsigned long)unsafe_src; /* * Do not allow userspace addresses. This disallows * normal userspace and the userspace guard page: */ if (vaddr < TASK_SIZE_MAX + PAGE_SIZE) return false; /* * Reading from the vsyscall page may cause an unhandled fault in * certain cases. Though it is at an address above TASK_SIZE_MAX, it is * usually considered as a user space address. */ if (is_vsyscall_vaddr(vaddr)) return false; /* * Allow everything during early boot before 'x86_virt_bits' * is initialized. Needed for instruction decoding in early * exception handlers. */ if (!boot_cpu_data.x86_virt_bits) return true; return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); } #else bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return (unsigned long)unsafe_src >= TASK_SIZE_MAX; } #endif
14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 // SPDX-License-Identifier: GPL-2.0-or-later /* auditfilter.c -- filtering of audit events * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <net/net_namespace.h> #include <net/sock.h> #include "audit.h" /* * Locking model: * * audit_filter_mutex: * Synchronizes writes and blocking reads of audit's filterlist * data. Rcu is used to traverse the filterlist and access * contents of structs audit_entry, audit_watch and opaque * LSM rules during filtering. If modified, these structures * must be copied and replace their counterparts in the filterlist. * An audit_parent struct is not accessed during filtering, so may * be written directly provided audit_filter_mutex is held. */ /* Audit filter lists, defined in <linux/audit.h> */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), LIST_HEAD_INIT(audit_filter_list[1]), LIST_HEAD_INIT(audit_filter_list[2]), LIST_HEAD_INIT(audit_filter_list[3]), LIST_HEAD_INIT(audit_filter_list[4]), LIST_HEAD_INIT(audit_filter_list[5]), LIST_HEAD_INIT(audit_filter_list[6]), LIST_HEAD_INIT(audit_filter_list[7]), #if AUDIT_NR_FILTERS != 8 #error Fix audit_filter_list initialiser #endif }; static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_rules_list[0]), LIST_HEAD_INIT(audit_rules_list[1]), LIST_HEAD_INIT(audit_rules_list[2]), LIST_HEAD_INIT(audit_rules_list[3]), LIST_HEAD_INIT(audit_rules_list[4]), LIST_HEAD_INIT(audit_rules_list[5]), LIST_HEAD_INIT(audit_rules_list[6]), LIST_HEAD_INIT(audit_rules_list[7]), }; DEFINE_MUTEX(audit_filter_mutex); static void audit_free_lsm_field(struct audit_field *f) { switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: kfree(f->lsm_str); security_audit_rule_free(f->lsm_rule); } } static inline void audit_free_rule(struct audit_entry *e) { int i; struct audit_krule *erule = &e->rule; /* some rules don't have associated watches */ if (erule->watch) audit_put_watch(erule->watch); if (erule->fields) for (i = 0; i < erule->field_count; i++) audit_free_lsm_field(&erule->fields[i]); kfree(erule->fields); kfree(erule->filterkey); kfree(e); } void audit_free_rule_rcu(struct rcu_head *head) { struct audit_entry *e = container_of(head, struct audit_entry, rcu); audit_free_rule(e); } /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { struct audit_entry *entry; struct audit_field *fields; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (unlikely(!entry)) return NULL; fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; } entry->rule.fields = fields; return entry; } /* Unpack a filter field's string representation from user-space * buffer. */ char *audit_unpack_string(void **bufp, size_t *remain, size_t len) { char *str; if (!*bufp || (len == 0) || (len > *remain)) return ERR_PTR(-EINVAL); /* Of the currently implemented string fields, PATH_MAX * defines the longest valid length. */ if (len > PATH_MAX) return ERR_PTR(-ENAMETOOLONG); str = kmalloc(len + 1, GFP_KERNEL); if (unlikely(!str)) return ERR_PTR(-ENOMEM); memcpy(str, *bufp, len); str[len] = 0; *bufp += len; *remain -= len; return str; } /* Translate an inode field to kernel representation. */ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if ((krule->listnr != AUDIT_FILTER_EXIT && krule->listnr != AUDIT_FILTER_URING_EXIT) || krule->inode_f || krule->watch || krule->tree || (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; krule->inode_f = f; return 0; } static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { unsigned n = *list++; if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { kfree(p); return -EINVAL; } p[AUDIT_WORD(n)] |= AUDIT_BIT(n); } if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { kfree(p); return -EINVAL; } classes[class] = p; return 0; } int audit_match_class(int class, unsigned syscall) { if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) return 0; if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) return 0; return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); } #ifdef CONFIG_AUDITSYSCALL static inline int audit_match_class_bits(int class, u32 *mask) { int i; if (classes[class]) { for (i = 0; i < AUDIT_BITMASK_SIZE; i++) if (mask[i] & classes[class][i]) return 0; } return 1; } static int audit_match_signal(struct audit_entry *entry) { struct audit_field *arch = entry->rule.arch_f; if (!arch) { /* When arch is unspecified, we must check both masks on biarch * as syscall number alone is ambiguous. */ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, entry->rule.mask) && audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, entry->rule.mask)); } switch (audit_classify_arch(arch->val)) { case 0: /* native */ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, entry->rule.mask)); case 1: /* 32bit on biarch */ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, entry->rule.mask)); default: return 1; } } #endif /* Common user-space to kernel rule translation. */ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule) { unsigned listnr; struct audit_entry *entry; int i, err; err = -EINVAL; listnr = rule->flags & ~AUDIT_FILTER_PREPEND; switch (listnr) { default: goto exit_err; #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: pr_err("AUDIT_FILTER_ENTRY is deprecated\n"); goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_URING_EXIT: case AUDIT_FILTER_TASK: #endif case AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { pr_err("AUDIT_POSSIBLE is deprecated\n"); goto exit_err; } if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) goto exit_err; if (rule->field_count > AUDIT_MAX_FIELDS) goto exit_err; err = -ENOMEM; entry = audit_init_entry(rule->field_count); if (!entry) goto exit_err; entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; entry->rule.listnr = listnr; entry->rule.action = rule->action; entry->rule.field_count = rule->field_count; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) entry->rule.mask[i] = rule->mask[i]; for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { int bit = AUDIT_BITMASK_SIZE * 32 - i - 1; __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; __u32 *class; if (!(*p & AUDIT_BIT(bit))) continue; *p &= ~AUDIT_BIT(bit); class = classes[i]; if (class) { int j; for (j = 0; j < AUDIT_BITMASK_SIZE; j++) entry->rule.mask[j] |= class[j]; } } return entry; exit_err: return ERR_PTR(err); } static u32 audit_ops[] = { [Audit_equal] = AUDIT_EQUAL, [Audit_not_equal] = AUDIT_NOT_EQUAL, [Audit_bitmask] = AUDIT_BIT_MASK, [Audit_bittest] = AUDIT_BIT_TEST, [Audit_lt] = AUDIT_LESS_THAN, [Audit_gt] = AUDIT_GREATER_THAN, [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, }; static u32 audit_to_op(u32 op) { u32 n; for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) ; return n; } /* check if an audit field is valid */ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { switch (f->type) { case AUDIT_MSGTYPE: if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; case AUDIT_FSTYPE: if (entry->rule.listnr != AUDIT_FILTER_FS) return -EINVAL; break; case AUDIT_PERM: if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT) return -EINVAL; break; } switch (entry->rule.listnr) { case AUDIT_FILTER_FS: switch (f->type) { case AUDIT_FSTYPE: case AUDIT_FILTERKEY: break; default: return -EINVAL; } } /* Check for valid field type and op */ switch (f->type) { case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: case AUDIT_PERS: /* <uapi/linux/personality.h> */ case AUDIT_DEVMINOR: /* all ops are valid */ break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_LOGINUID: case AUDIT_OBJ_UID: case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: case AUDIT_PID: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: case AUDIT_SESSIONID: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: case AUDIT_SADDR_FAM: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_WATCH: case AUDIT_DIR: case AUDIT_FILTERKEY: case AUDIT_LOGINUID_SET: case AUDIT_ARCH: case AUDIT_FSTYPE: case AUDIT_PERM: case AUDIT_FILETYPE: case AUDIT_FIELD_COMPARE: case AUDIT_EXE: /* only equal and not equal valid ops */ if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; default: /* field not recognized */ return -EINVAL; } /* Check for select valid field values */ switch (f->type) { case AUDIT_LOGINUID_SET: if ((f->val != 0) && (f->val != 1)) return -EINVAL; break; case AUDIT_PERM: if (f->val & ~15) return -EINVAL; break; case AUDIT_FILETYPE: if (f->val & ~S_IFMT) return -EINVAL; break; case AUDIT_FIELD_COMPARE: if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; case AUDIT_SADDR_FAM: if (f->val >= AF_MAX) return -EINVAL; break; default: break; } return 0; } /* Translate struct audit_rule_data to kernel's rule representation. */ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) { int err = 0; struct audit_entry *entry; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) goto exit_nofree; bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; u32 f_val; err = -EINVAL; f->op = audit_to_op(data->fieldflags[i]); if (f->op == Audit_bad) goto exit_free; f->type = data->fields[i]; f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } err = audit_field_valid(entry, f); if (err) goto exit_free; err = -EINVAL; switch (f->type) { case AUDIT_LOGINUID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } entry->rule.buflen += f_val; f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; } else if (err) goto exit_free; break; case AUDIT_WATCH: str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } entry->rule.buflen += f_val; break; case AUDIT_DIR: str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; entry->rule.buflen += f_val; break; case AUDIT_INODE: f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; default: f->val = f_val; break; } } if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) entry->rule.inode_f = NULL; exit_nofree: return entry; exit_free: if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ if (entry->rule.exe) audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } /* Pack a filter field's string representation into data block. */ static inline size_t audit_pack_string(void **bufp, const char *str) { size_t len = strlen(str); memcpy(*bufp, str, len); *bufp += len; return len; } /* Translate kernel rule representation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { struct audit_rule_data *data; void *bufp; int i; data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); if (unlikely(!data)) return NULL; memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; data->action = krule->action; data->field_count = krule->field_count; bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &krule->fields[i]; data->fields[i] = f->type; data->fieldflags[i] = audit_ops[f->op]; switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: data->buflen += data->values[i] = audit_pack_string(&bufp, f->lsm_str); break; case AUDIT_WATCH: data->buflen += data->values[i] = audit_pack_string(&bufp, audit_watch_path(krule->watch)); break; case AUDIT_DIR: data->buflen += data->values[i] = audit_pack_string(&bufp, audit_tree_path(krule->tree)); break; case AUDIT_FILTERKEY: data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; case AUDIT_EXE: data->buflen += data->values[i] = audit_pack_string(&bufp, audit_mark_path(krule->exe)); break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; data->values[i] = AUDIT_UID_UNSET; break; } fallthrough; /* if set */ default: data->values[i] = f->val; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; return data; } /* Compare two rules in kernel format. Considered success if rules * don't match. */ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) { int i; if (a->flags != b->flags || a->pflags != b->pflags || a->listnr != b->listnr || a->action != b->action || a->field_count != b->field_count) return 1; for (i = 0; i < a->field_count; i++) { if (a->fields[i].type != b->fields[i].type || a->fields[i].op != b->fields[i].op) return 1; switch (a->fields[i].type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) return 1; break; case AUDIT_WATCH: if (strcmp(audit_watch_path(a->watch), audit_watch_path(b->watch))) return 1; break; case AUDIT_DIR: if (strcmp(audit_tree_path(a->tree), audit_tree_path(b->tree))) return 1; break; case AUDIT_FILTERKEY: /* both filterkeys exist based on above type compare */ if (strcmp(a->filterkey, b->filterkey)) return 1; break; case AUDIT_EXE: /* both paths exist based on above type compare */ if (strcmp(audit_mark_path(a->exe), audit_mark_path(b->exe))) return 1; break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_LOGINUID: case AUDIT_OBJ_UID: if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) return 1; break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) return 1; break; default: if (a->fields[i].val != b->fields[i].val) return 1; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) if (a->mask[i] != b->mask[i]) return 1; return 0; } /* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. */ static inline int audit_dupe_lsm_field(struct audit_field *df, struct audit_field *sf) { int ret; char *lsm_str; /* our own copy of lsm_str */ lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); if (unlikely(!lsm_str)) return -ENOMEM; df->lsm_str = lsm_str; /* our own (refreshed) copy of lsm_rule */ ret = security_audit_rule_init(df->type, df->op, df->lsm_str, (void **)&df->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { pr_warn("audit rule for LSM \'%s\' is invalid\n", df->lsm_str); ret = 0; } return ret; } /* Duplicate an audit rule. This will be a deep copy with the exception * of the watch - that pointer is carried over. The LSM specific fields * will be updated in the copy. The point is to be able to replace the old * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ struct audit_entry *audit_dupe_rule(struct audit_krule *old) { u32 fcount = old->field_count; struct audit_entry *entry; struct audit_krule *new; char *fk; int i, err = 0; entry = audit_init_entry(fcount); if (unlikely(!entry)) return ERR_PTR(-ENOMEM); new = &entry->rule; new->flags = old->flags; new->pflags = old->pflags; new->listnr = old->listnr; new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; new->field_count = old->field_count; /* * note that we are OK with not refcounting here; audit_match_tree() * never dereferences tree and we can't get false positives there * since we'd have to have rule gone from the list *and* removed * before the chunks found by lookup had been allocated, i.e. before * the beginning of list scan. */ new->tree = old->tree; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); /* deep copy this information, updating the lsm_rule fields, because * the originals will all be freed when the old rule is freed. */ for (i = 0; i < fcount; i++) { switch (new->fields[i].type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: err = audit_dupe_lsm_field(&new->fields[i], &old->fields[i]); break; case AUDIT_FILTERKEY: fk = kstrdup(old->filterkey, GFP_KERNEL); if (unlikely(!fk)) err = -ENOMEM; else new->filterkey = fk; break; case AUDIT_EXE: err = audit_dupe_exe(new, old); break; } if (err) { if (new->exe) audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } } if (old->watch) { audit_get_watch(old->watch); new->watch = old->watch; } return entry; } /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. */ static struct audit_entry *audit_find_rule(struct audit_entry *entry, struct list_head **p) { struct audit_entry *e, *found = NULL; struct list_head *list; int h; if (entry->rule.inode_f) { h = audit_hash_ino(entry->rule.inode_f->val); *p = list = &audit_inode_hash[h]; } else if (entry->rule.watch) { /* we don't know the inode number, so must walk entire hash */ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { list = &audit_inode_hash[h]; list_for_each_entry(e, list, list) if (!audit_compare_rule(&entry->rule, &e->rule)) { found = e; goto out; } } goto out; } else { *p = list = &audit_filter_list[entry->rule.listnr]; } list_for_each_entry(e, list, list) if (!audit_compare_rule(&entry->rule, &e->rule)) { found = e; goto out; } out: return found; } static u64 prio_low = ~0ULL/2; static u64 prio_high = ~0ULL/2 - 1; /* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry) { struct audit_entry *e; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int err = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; /* If any of these, don't count towards total */ switch (entry->rule.listnr) { case AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } #endif mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (e) { mutex_unlock(&audit_filter_mutex); err = -EEXIST; /* normally audit_add_tree_rule() will free it on failure */ if (tree) audit_put_tree(tree); return err; } if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); /* * normally audit_add_tree_rule() will free it * on failure */ if (tree) audit_put_tree(tree); return err; } } if (tree) { err = audit_add_tree_rule(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); return err; } } entry->rule.prio = ~0ULL; if (entry->rule.listnr == AUDIT_FILTER_EXIT || entry->rule.listnr == AUDIT_FILTER_URING_EXIT) { if (entry->rule.flags & AUDIT_FILTER_PREPEND) entry->rule.prio = ++prio_high; else entry->rule.prio = --prio_low; } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add(&entry->rule.list, &audit_rules_list[entry->rule.listnr]); list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { list_add_tail(&entry->rule.list, &audit_rules_list[entry->rule.listnr]); list_add_tail_rcu(&entry->list, list); } #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules++; if (!audit_match_signal(entry)) audit_signals++; #endif mutex_unlock(&audit_filter_mutex); return err; } /* Remove an existing rule from filterlist. */ int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; /* If any of these, don't count towards total */ switch (entry->rule.listnr) { case AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } #endif mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (!e) { ret = -ENOENT; goto out; } if (e->rule.watch) audit_remove_watch_rule(&e->rule); if (e->rule.tree) audit_remove_tree_rule(&e->rule); if (e->rule.exe) audit_remove_mark_rule(&e->rule); #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules--; if (!audit_match_signal(entry)) audit_signals--; #endif list_del_rcu(&e->list); list_del(&e->rule.list); call_rcu(&e->rcu, audit_free_rule_rcu); out: mutex_unlock(&audit_filter_mutex); if (tree) audit_put_tree(tree); /* that's the temporary one */ return ret; } /* List rules using struct audit_rule_data. */ static void audit_list_rules(int seq, struct sk_buff_head *q) { struct sk_buff *skb; struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule_data *data; data = audit_krule_to_data(r); if (unlikely(!data)) break; skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, data, struct_size(data, buf, data->buflen)); if (skb) skb_queue_tail(q, skb); kfree(data); } } skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); } /* Log rule additions and removals */ static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) { struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); audit_log_task_context(ab); audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); } /** * audit_rule_change - apply all rules to the specified message type * @type: audit message type * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data */ int audit_rule_change(int type, int seq, void *data, size_t datasz) { int err = 0; struct audit_entry *entry; switch (type) { case AUDIT_ADD_RULE: entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: WARN_ON(1); return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { if (entry->rule.exe) audit_remove_mark(entry->rule.exe); audit_free_rule(entry); } return err; } /** * audit_list_rules_send - list the audit rules * @request_skb: skb of request we are replying to (used to target the reply) * @seq: netlink audit message sequence (serial) number */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) { struct task_struct *tsk; struct audit_netlink_list *dest; /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for * auditctl to read from it... which isn't ever going to * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ dest = kmalloc(sizeof(*dest), GFP_KERNEL); if (!dest) return -ENOMEM; dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); dest->portid = NETLINK_CB(request_skb).portid; skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list"); if (IS_ERR(tsk)) { skb_queue_purge(&dest->q); put_net(dest->net); kfree(dest); return PTR_ERR(tsk); } return 0; } int audit_comparator(u32 left, u32 op, u32 right) { switch (op) { case Audit_equal: return (left == right); case Audit_not_equal: return (left != right); case Audit_lt: return (left < right); case Audit_le: return (left <= right); case Audit_gt: return (left > right); case Audit_ge: return (left >= right); case Audit_bitmask: return (left & right); case Audit_bittest: return ((left & right) == right); default: return 0; } } int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) { switch (op) { case Audit_equal: return uid_eq(left, right); case Audit_not_equal: return !uid_eq(left, right); case Audit_lt: return uid_lt(left, right); case Audit_le: return uid_lte(left, right); case Audit_gt: return uid_gt(left, right); case Audit_ge: return uid_gte(left, right); case Audit_bitmask: case Audit_bittest: default: return 0; } } int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) { switch (op) { case Audit_equal: return gid_eq(left, right); case Audit_not_equal: return !gid_eq(left, right); case Audit_lt: return gid_lt(left, right); case Audit_le: return gid_lte(left, right); case Audit_gt: return gid_gt(left, right); case Audit_ge: return gid_gte(left, right); case Audit_bitmask: case Audit_bittest: default: return 0; } } /** * parent_len - find the length of the parent portion of a pathname * @path: pathname of which to determine length */ int parent_len(const char *path) { int plen; const char *p; plen = strlen(path); if (plen == 0) return plen; /* disregard trailing slashes */ p = path + plen - 1; while ((*p == '/') && (p > path)) p--; /* walk backward until we find the next slash or hit beginning */ while ((*p != '/') && (p > path)) p--; /* did we find a slash? Then increment to include it in path */ if (*p == '/') p++; return p - path; } /** * audit_compare_dname_path - compare given dentry name with last component in * given path. Return of 0 indicates a match. * @dname: dentry name that we're comparing * @path: full pathname that we're comparing * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL * here indicates that we must compute this value. */ int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen) { int dlen, pathlen; const char *p; dlen = dname->len; pathlen = strlen(path); if (pathlen < dlen) return 1; parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; if (pathlen - parentlen != dlen) return 1; p = path + parentlen; return strncmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) { struct audit_entry *e; int ret = 1; /* Audit by default */ rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { int i, result = 0; for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; pid_t pid; u32 sid; switch (f->type) { case AUDIT_PID: pid = task_pid_nr(current); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: result = audit_uid_comparator(current_uid(), f->op, f->uid); break; case AUDIT_GID: result = audit_gid_comparator(current_gid(), f->op, f->gid); break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(current), f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(current), f->op, f->val); break; case AUDIT_MSGTYPE: result = audit_comparator(msgtype, f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { security_current_getsecid_subj(&sid); result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule); } break; case AUDIT_EXE: result = audit_exe_compare(current, e->rule.exe); if (f->op == Audit_not_equal) result = !result; break; default: goto unlock_and_return; } if (result < 0) /* error */ goto unlock_and_return; if (!result) break; } if (result > 0) { if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE) ret = 0; break; } } unlock_and_return: rcu_read_unlock(); return ret; } static int update_lsm_rule(struct audit_krule *r) { struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; int err = 0; if (!security_audit_rule_known(r)) return 0; nentry = audit_dupe_rule(r); if (entry->rule.exe) audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); if (r->watch) list_del(&r->rlist); list_del_rcu(&entry->list); list_del(&r->list); } else { if (r->watch || r->tree) list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); list_replace(&r->list, &nentry->rule.list); } call_rcu(&entry->rcu, audit_free_rule_rcu); return err; } /* This function will re-initialize the lsm_rule field of all applicable rules. * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ int audit_update_lsm_rules(void) { struct audit_krule *r, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { int res = update_lsm_rule(r); if (!err) err = res; } } mutex_unlock(&audit_filter_mutex); return err; }
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/util.c * Copyright (C) 1992 Krishna Balasubramanian * * Sep 1997 - Call suser() last after "normal" permission checks so we * get BSD style process accounting right. * Occurs in several places in the IPC code. * Chris Evans, <chris@ferret.lmh.ox.ac.uk> * Nov 1999 - ipc helper functions, unified SMP locking * Manfred Spraul <manfred@colorfullife.com> * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary(). * Mingming Cao <cmm@us.ibm.com> * Mar 2006 - support for audit of ipc object properties * Dustin Kirkland <dustin.kirkland@us.ibm.com> * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * General sysv ipc locking scheme: * rcu_read_lock() * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr * tree. * - perform initial checks (capabilities, auditing and permission, * etc). * - perform read-only operations, such as INFO command, that * do not demand atomicity * acquire the ipc lock (kern_ipc_perm.lock) through * ipc_lock_object() * - perform read-only operations that demand atomicity, * such as STAT command. * - perform data updates, such as SET, RMID commands and * mechanism-specific operations (semop/semtimedop, * msgsnd/msgrcv, shmat/shmdt). * drop the ipc lock, through ipc_unlock_object(). * rcu_read_unlock() * * The ids->rwsem must be taken when: * - creating, removing and iterating the existing entries in ipc * identifier sets. * - iterating through files under /proc/sysvipc/ * * Note that sems have a special fast path that avoids kern_ipc_perm.lock - * see sem_lock(). */ #include <linux/mm.h> #include <linux/shm.h> #include <linux/init.h> #include <linux/msg.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/capability.h> #include <linux/highuid.h> #include <linux/security.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/audit.h> #include <linux/nsproxy.h> #include <linux/rwsem.h> #include <linux/memory.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/log2.h> #include <asm/unistd.h> #include "util.h" struct ipc_proc_iface { const char *path; const char *header; int ids; int (*show)(struct seq_file *, void *); }; /** * ipc_init - initialise ipc subsystem * * The various sysv ipc resources (semaphores, messages and shared * memory) are initialised. * * A callback routine is registered into the memory hotplug notifier * chain: since msgmni scales to lowmem this callback routine will be * called upon successful memory add / remove to recompute msmgni. */ static int __init ipc_init(void) { proc_mkdir("sysvipc", NULL); sem_init(); msg_init(); shm_init(); return 0; } device_initcall(ipc_init); static const struct rhashtable_params ipc_kht_params = { .head_offset = offsetof(struct kern_ipc_perm, khtnode), .key_offset = offsetof(struct kern_ipc_perm, key), .key_len = sizeof_field(struct kern_ipc_perm, key), .automatic_shrinking = true, }; /** * ipc_init_ids - initialise ipc identifiers * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited * below ipc_mni) then initialise the keys hashtable and ids idr. */ void ipc_init_ids(struct ipc_ids *ids) { ids->in_use = 0; ids->seq = 0; init_rwsem(&ids->rwsem); rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); ids->max_idx = -1; ids->last_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif } #ifdef CONFIG_PROC_FS static const struct proc_ops sysvipc_proc_ops; /** * ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface. * @path: Path in procfs * @header: Banner to be printed at the beginning of the file. * @ids: ipc id table to iterate. * @show: show routine. */ void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)) { struct proc_dir_entry *pde; struct ipc_proc_iface *iface; iface = kmalloc(sizeof(*iface), GFP_KERNEL); if (!iface) return; iface->path = path; iface->header = header; iface->ids = ids; iface->show = show; pde = proc_create_data(path, S_IRUGO, /* world readable */ NULL, /* parent dir */ &sysvipc_proc_ops, iface); if (!pde) kfree(iface); } #endif /** * ipc_findkey - find a key in an ipc identifier set * @ids: ipc identifier set * @key: key to find * * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * * Called with writer ipc_ids.rwsem held. */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { struct kern_ipc_perm *ipcp; ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, ipc_kht_params); if (!ipcp) return NULL; rcu_read_lock(); ipc_lock_object(ipcp); return ipcp; } /* * Insert new IPC object into idr tree, and set sequence number and id * in the correct order. * Especially: * - the sequence number must be set before inserting the object into the idr, * because the sequence number is accessed without a lock. * - the id can/must be set after inserting the object into the idr. * All accesses must be done after getting kern_ipc_perm.lock. * * The caller must own kern_ipc_perm.lock.of the new object. * On error, the function returns a (negative) error code. * * To conserve sequence number space, especially with extended ipc_mni, * the sequence number is incremented only when the returned ID is less than * the last one. */ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { int idx, next_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE next_id = ids->next_id; ids->next_id = -1; #endif /* * As soon as a new object is inserted into the idr, * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it, * and the lockless preparations for ipc operations can start. * This means especially: permission checks, audit calls, allocation * of undo structures, ... * * Thus the object must be fully initialized, and if something fails, * then the full tear-down sequence must be followed. * (i.e.: set new->deleted, reduce refcount, call_rcu()) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ int max_idx; max_idx = max(ids->in_use*3/2, ipc_min_cycle); max_idx = min(max_idx, ipc_mni); /* allocate the idx, with a NULL struct kern_ipc_perm */ idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, GFP_NOWAIT); if (idx >= 0) { /* * idx got allocated successfully. * Now calculate the sequence number and set the * pointer for real. */ if (idx <= ids->last_idx) { ids->seq++; if (ids->seq >= ipcid_seq_max()) ids->seq = 0; } ids->last_idx = idx; new->seq = ids->seq; /* no need for smp_wmb(), this is done * inside idr_replace, as part of * rcu_assign_pointer */ idr_replace(&ids->ipcs_idr, new, idx); } } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, GFP_NOWAIT); } if (idx >= 0) new->id = (new->seq << ipcmni_seq_shift()) + idx; return idx; } /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set * @new: new ipc permission set * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. The permissions object is * initialised and the first free entry is set up and the index assigned * is returned. The 'new' entry is returned in a locked state on success. * * On failure the entry is not locked and a negative err-code is returned. * The caller must use ipc_rcu_putref() to free the identifier. * * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; int idx, err; /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); if (limit > ipc_mni) limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); spin_lock_init(&new->lock); rcu_read_lock(); spin_lock(&new->lock); current_euid_egid(&euid, &egid); new->cuid = new->uid = euid; new->gid = new->cgid = egid; new->deleted = false; idx = ipc_idr_alloc(ids, new); idr_preload_end(); if (idx >= 0 && new->key != IPC_PRIVATE) { err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, ipc_kht_params); if (err < 0) { idr_remove(&ids->ipcs_idr, idx); idx = err; } } if (idx < 0) { new->deleted = true; spin_unlock(&new->lock); rcu_read_unlock(); return idx; } ids->in_use++; if (idx > ids->max_idx) ids->max_idx = idx; return idx; } /** * ipcget_new - create a new ipc object * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is IPC_PRIVATE. */ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { int err; down_write(&ids->rwsem); err = ops->getnew(ns, params); up_write(&ids->rwsem); return err; } /** * ipc_check_perms - check security and permissions for an ipc object * @ns: ipc namespace * @ipcp: ipc permission set * @ops: the actual security routine to call * @params: its parameters * * This routine is called by sys_msgget(), sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE and that key already exists in the * ds IDR. * * On success, the ipc id is returned. * * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, const struct ipc_ops *ops, struct ipc_params *params) { int err; if (ipcperms(ns, ipcp, params->flg)) err = -EACCES; else { err = ops->associate(ipcp, params->flg); if (!err) err = ipcp->id; } return err; } /** * ipcget_public - get an ipc object or create a new one * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE. * It adds a new entry if the key is not found and does some permission * / security checkings if the key is found. * * On success, the ipc id is returned. */ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { struct kern_ipc_perm *ipcp; int flg = params->flg; int err; /* * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ if (!(flg & IPC_CREAT)) err = -ENOENT; else err = ops->getnew(ns, params); } else { /* ipc object has been locked by ipc_findkey() */ if (flg & IPC_CREAT && flg & IPC_EXCL) err = -EEXIST; else { err = 0; if (ops->more_checks) err = ops->more_checks(ipcp, params); if (!err) /* * ipc_check_perms returns the IPC id on * success */ err = ipc_check_perms(ns, ipcp, ops, params); } ipc_unlock(ipcp); } up_write(&ids->rwsem); return err; } /** * ipc_kht_remove - remove an ipc from the key hashtable * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, ipc_kht_params)); } /** * ipc_search_maxidx - search for the highest assigned index * @ids: ipc identifier set * @limit: known upper limit for highest assigned index * * The function determines the highest assigned index in @ids. It is intended * to be called when ids->max_idx needs to be updated. * Updating ids->max_idx is necessary when the current highest index ipc * object is deleted. * If no ipc object is allocated, then -1 is returned. * * ipc_ids.rwsem needs to be held by the caller. */ static int ipc_search_maxidx(struct ipc_ids *ids, int limit) { int tmpidx; int i; int retval; i = ilog2(limit+1); retval = 0; for (; i >= 0; i--) { tmpidx = retval | (1<<i); /* * "0" is a possible index value, thus search using * e.g. 15,7,3,1,0 instead of 16,8,4,2,1. */ tmpidx = tmpidx-1; if (idr_get_next(&ids->ipcs_idr, &tmpidx)) retval |= (1<<i); } return retval - 1; } /** * ipc_rmid - remove an ipc identifier * @ids: ipc identifier set * @ipcp: ipc perm structure containing the identifier to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; if (unlikely(idx == ids->max_idx)) { idx = ids->max_idx-1; if (idx >= 0) idx = ipc_search_maxidx(ids, idx); ids->max_idx = idx; } } /** * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to modify * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { ipc_kht_remove(ids, ipcp); ipcp->key = IPC_PRIVATE; } bool ipc_rcu_getref(struct kern_ipc_perm *ptr) { return refcount_inc_not_zero(&ptr->refcount); } void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)) { if (!refcount_dec_and_test(&ptr->refcount)) return; call_rcu(&ptr->rcu, func); } /** * ipcperms - check ipc permissions * @ns: ipc namespace * @ipcp: ipc permission set * @flag: desired permission set * * Check user, group, other permissions for access * to ipc resources. return 0 if allowed * * @flag will most probably be 0 or ``S_...UGO`` from <linux/stat.h> */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) { kuid_t euid = current_euid(); int requested_mode, granted_mode; audit_ipc_obj(ipcp); requested_mode = (flag >> 6) | (flag >> 3) | flag; granted_mode = ipcp->mode; if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid)) granted_mode >>= 6; else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid)) granted_mode >>= 3; /* is there some bit set in requested_mode but not in granted_mode? */ if ((requested_mode & ~granted_mode & 0007) && !ns_capable(ns->user_ns, CAP_IPC_OWNER)) return -1; return security_ipc_permission(ipcp, flag); } /* * Functions to convert between the kern_ipc_perm structure and the * old/new ipc_perm structures */ /** * kernel_to_ipc64_perm - convert kernel ipc permissions to user * @in: kernel permissions * @out: new style ipc permissions * * Turn the kernel object @in into a set of permissions descriptions * for returning to userspace (@out). */ void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out) { out->key = in->key; out->uid = from_kuid_munged(current_user_ns(), in->uid); out->gid = from_kgid_munged(current_user_ns(), in->gid); out->cuid = from_kuid_munged(current_user_ns(), in->cuid); out->cgid = from_kgid_munged(current_user_ns(), in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc64_perm_to_ipc_perm - convert new ipc permissions to old * @in: new style ipc permissions * @out: old style ipc permissions * * Turn the new style permissions object @in into a compatibility * object and store it into the @out pointer. */ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) { out->key = in->key; SET_UID(out->uid, in->uid); SET_GID(out->gid, in->gid); SET_UID(out->cuid, in->cuid); SET_GID(out->cgid, in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc_obtain_object_idr * @ids: ipc identifier set * @id: ipc id to look for * * Look for an id in the ipc ids idr and return associated ipc object. * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; int idx = ipcid_to_idx(id); out = idr_find(&ids->ipcs_idr, idx); if (!out) return ERR_PTR(-EINVAL); return out; } /** * ipc_obtain_object_check * @ids: ipc identifier set * @id: ipc id to look for * * Similar to ipc_obtain_object_idr() but also checks the ipc object * sequence number. * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out = ipc_obtain_object_idr(ids, id); if (IS_ERR(out)) goto out; if (ipc_checkid(out, id)) return ERR_PTR(-EINVAL); out: return out; } /** * ipcget - Common sys_*get() code * @ns: namespace * @ids: ipc identifier set * @ops: operations to be called on ipc object creation, permission checks * and further checks * @params: the parameters needed by the previous operations. * * Common routine called by sys_msgget(), sys_semget() and sys_shmget(). */ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { if (params->key == IPC_PRIVATE) return ipcget_new(ns, ids, ops, params); else return ipcget_public(ns, ids, ops, params); } /** * ipc_update_perm - update the permissions of an ipc object * @in: the permission given as input. * @out: the permission of the ipc to set. */ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) { kuid_t uid = make_kuid(current_user_ns(), in->uid); kgid_t gid = make_kgid(current_user_ns(), in->gid); if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; out->uid = uid; out->gid = gid; out->mode = (out->mode & ~S_IRWXUGO) | (in->mode & S_IRWXUGO); return 0; } /** * ipcctl_obtain_check - retrieve an ipc object and check permissions * @ns: ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve * @cmd: the cmd to check * @perm: the permission to set * @extra_perm: one extra permission parameter used by msq * * This function does some common audit and permissions check for some IPC_XXX * cmd and is called from semctl_down, shmctl_down and msgctl_down. * * It: * - retrieves the ipc object with the given id in the given table. * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding * error. * * Call holding the both the rwsem and the rcu read lock. */ struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; struct kern_ipc_perm *ipcp; ipcp = ipc_obtain_object_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto err; } audit_ipc_obj(ipcp); if (cmd == IPC_SET) audit_ipc_set_perm(extra_perm, perm->uid, perm->gid, perm->mode); euid = current_euid(); if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) || ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ipcp; /* successful lookup */ err: return ERR_PTR(err); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /** * ipc_parse_version - ipc call version * @cmd: pointer to command * * Return IPC_64 for new style IPC and IPC_OLD for old style IPC. * The @cmd value is turned from an encoding command and version into * just the command code. */ int ipc_parse_version(int *cmd) { if (*cmd & IPC_64) { *cmd ^= IPC_64; return IPC_64; } else { return IPC_OLD; } } #endif /* CONFIG_ARCH_WANT_IPC_PARSE_VERSION */ #ifdef CONFIG_PROC_FS struct ipc_proc_iter { struct ipc_namespace *ns; struct pid_namespace *pid_ns; struct ipc_proc_iface *iface; }; struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) { struct ipc_proc_iter *iter = s->private; return iter->pid_ns; } /** * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos * @ids: ipc identifier set * @pos: expected position * * The function finds an ipc structure, based on the sequence file * position @pos. If there is no ipc structure at position @pos, then * the successor is selected. * If a structure is found, then it is locked (both rcu_read_lock() and * ipc_lock_object()) and @pos is set to the position needed to locate * the found ipc structure. * If nothing is found (i.e. EOF), @pos is not modified. * * The function returns the found ipc structure, or NULL at EOF. */ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { int tmpidx; struct kern_ipc_perm *ipc; /* convert from position to idr index -> "-1" */ tmpidx = *pos - 1; ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); if (ipc != NULL) { rcu_read_lock(); ipc_lock_object(ipc); /* convert from idr index to position -> "+1" */ *pos = tmpidx + 1; } return ipc; } static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct kern_ipc_perm *ipc = it; /* If we had an ipc id locked before, unlock it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); /* Next -> search for *pos+1 */ (*pos)++; return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; ids = &iter->ns->ids[iface->ids]; /* * Take the lock - this will be released by the corresponding * call to stop(). */ down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) return NULL; /* pos == 0 means header */ if (*pos == 0) return SEQ_START_TOKEN; /* Otherwise return the correct ipc structure */ return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) { struct kern_ipc_perm *ipc = it; struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; /* If we had a locked structure, release it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; if (it == SEQ_START_TOKEN) { seq_puts(s, iface->header); return 0; } return iface->show(s, it); } static const struct seq_operations sysvipc_proc_seqops = { .start = sysvipc_proc_start, .stop = sysvipc_proc_stop, .next = sysvipc_proc_next, .show = sysvipc_proc_show, }; static int sysvipc_proc_open(struct inode *inode, struct file *file) { struct ipc_proc_iter *iter; iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter)); if (!iter) return -ENOMEM; iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); return 0; } static int sysvipc_proc_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct ipc_proc_iter *iter = seq->private; put_ipc_ns(iter->ns); put_pid_ns(iter->pid_ns); return seq_release_private(inode, file); } static const struct proc_ops sysvipc_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = sysvipc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = sysvipc_proc_release, }; #endif /* CONFIG_PROC_FS */
634 154 154 618 154 154 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H #include <linux/netfilter/nf_conntrack_zones_common.h> #include <net/netfilter/nf_conntrack.h> static inline const struct nf_conntrack_zone * nf_ct_zone(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_ZONES return &ct->zone; #else return &nf_ct_zone_dflt; #endif } static inline const struct nf_conntrack_zone * nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags) { zone->id = id; zone->flags = flags; zone->dir = dir; return zone; } static inline const struct nf_conntrack_zone * nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb, struct nf_conntrack_zone *tmp) { #ifdef CONFIG_NF_CONNTRACK_ZONES if (!tmpl) return &nf_ct_zone_dflt; if (tmpl->zone.flags & NF_CT_FLAG_MARK) return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0); #endif return nf_ct_zone(tmpl); } static inline void nf_ct_zone_add(struct nf_conn *ct, const struct nf_conntrack_zone *zone) { #ifdef CONFIG_NF_CONNTRACK_ZONES ct->zone = *zone; #endif } static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, enum ip_conntrack_dir dir) { return zone->dir & (1 << dir); } static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, enum ip_conntrack_dir dir) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_matches_dir(zone, dir) ? zone->id : NF_CT_DEFAULT_ZONE_ID; #else return NF_CT_DEFAULT_ZONE_ID; #endif } static inline bool nf_ct_zone_equal(const struct nf_conn *a, const struct nf_conntrack_zone *b, enum ip_conntrack_dir dir) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_id(nf_ct_zone(a), dir) == nf_ct_zone_id(b, dir); #else return true; #endif } static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, const struct nf_conntrack_zone *b) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone(a)->id == b->id; #else return true; #endif } #endif /* _NF_CONNTRACK_ZONES_H */
2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 // SPDX-License-Identifier: GPL-2.0-or-later #include <net/genetlink.h> #include <uapi/linux/mrp_bridge.h> #include "br_private.h" #include "br_private_mrp.h" static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_IN_ROLE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_IN_STATE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_START_IN_TEST] = { .type = NLA_NESTED }, }; static const struct nla_policy br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 }, }; static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1]; struct br_mrp_instance inst; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr, br_mrp_instance_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] || !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] || !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX"); return -EINVAL; } memset(&inst, 0, sizeof(inst)); inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); inst.prio = MRP_DEFAULT_PRIO; if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]) inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]); if (cmd == RTM_SETLINK) return br_mrp_add(br, &inst); else return br_mrp_del(br, &inst); return 0; } static const struct nla_policy br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = { [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 }, }; static int br_mrp_port_state_parse(struct net_bridge_port *p, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1]; enum br_mrp_port_state_type state; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr, br_mrp_port_state_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE"); return -EINVAL; } state = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]); return br_mrp_set_port_state(p, state); } static const struct nla_policy br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = { [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 }, }; static int br_mrp_port_role_parse(struct net_bridge_port *p, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1]; enum br_mrp_port_role_type role; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr, br_mrp_port_role_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE"); return -EINVAL; } role = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]); return br_mrp_set_port_role(p, role); } static const struct nla_policy br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = { [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 }, }; static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1]; struct br_mrp_ring_state state; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr, br_mrp_ring_state_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID] || !tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: RING_ID or STATE"); return -EINVAL; } memset(&state, 0x0, sizeof(state)); state.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID]); state.ring_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]); return br_mrp_set_ring_state(br, &state); } static const struct nla_policy br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = { [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 }, }; static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1]; struct br_mrp_ring_role role; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr, br_mrp_ring_role_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] || !tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: RING_ID or ROLE"); return -EINVAL; } memset(&role, 0x0, sizeof(role)); role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID]); role.ring_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]); return br_mrp_set_ring_role(br, &role); } static const struct nla_policy br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 }, }; static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_START_TEST_MAX + 1]; struct br_mrp_start_test test; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_TEST_MAX, attr, br_mrp_start_test_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID] || !tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] || !tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] || !tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); return -EINVAL; } memset(&test, 0x0, sizeof(test)); test.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID]); test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); test.monitor = false; if (tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]) test.monitor = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]); return br_mrp_start_test(br, &test); } static const struct nla_policy br_mrp_in_state_policy[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1] = { [IFLA_BRIDGE_MRP_IN_STATE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_IN_STATE_IN_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_IN_STATE_STATE] = { .type = NLA_U32 }, }; static int br_mrp_in_state_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1]; struct br_mrp_in_state state; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_STATE_MAX, attr, br_mrp_in_state_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID] || !tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: IN_ID or STATE"); return -EINVAL; } memset(&state, 0x0, sizeof(state)); state.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID]); state.in_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]); return br_mrp_set_in_state(br, &state); } static const struct nla_policy br_mrp_in_role_policy[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1] = { [IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] = { .type = NLA_U16 }, [IFLA_BRIDGE_MRP_IN_ROLE_ROLE] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] = { .type = NLA_U32 }, }; static int br_mrp_in_role_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1]; struct br_mrp_in_role role; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_ROLE_MAX, attr, br_mrp_in_role_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] || !tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] || !tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] || !tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: RING_ID or ROLE or IN_ID or I_IFINDEX"); return -EINVAL; } memset(&role, 0x0, sizeof(role)); role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID]); role.in_id = nla_get_u16(tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID]); role.i_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX]); role.in_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]); return br_mrp_set_in_role(br, &role); } static const struct nla_policy br_mrp_start_in_test_policy[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD] = { .type = NLA_U32 }, }; static int br_mrp_start_in_test_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1]; struct br_mrp_start_in_test test; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_IN_TEST_MAX, attr, br_mrp_start_in_test_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] || !tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] || !tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] || !tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]) { NL_SET_ERR_MSG_MOD(extack, "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); return -EINVAL; } memset(&test, 0x0, sizeof(test)); test.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID]); test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL]); test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS]); test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]); return br_mrp_start_in_test(br, &test); } int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; int err; /* When this function is called for a port then the br pointer is * invalid, therefor set the br to point correctly */ if (p) br = p->br; if (br->stp_enabled != BR_NO_STP) { NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled"); return -EINVAL; } err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr, br_mrp_policy, extack); if (err) return err; if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE], cmd, extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { err = br_mrp_ring_state_parse(br, tb[IFLA_BRIDGE_MRP_RING_STATE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_START_TEST]) { err = br_mrp_start_test_parse(br, tb[IFLA_BRIDGE_MRP_START_TEST], extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_IN_STATE]) { err = br_mrp_in_state_parse(br, tb[IFLA_BRIDGE_MRP_IN_STATE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_IN_ROLE]) { err = br_mrp_in_role_parse(br, tb[IFLA_BRIDGE_MRP_IN_ROLE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_START_IN_TEST]) { err = br_mrp_start_in_test_parse(br, tb[IFLA_BRIDGE_MRP_START_IN_TEST], extack); if (err) return err; } return 0; } int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) { struct nlattr *tb, *mrp_tb; struct br_mrp *mrp; mrp_tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP); if (!mrp_tb) return -EMSGSIZE; hlist_for_each_entry_rcu(mrp, &br->mrp_list, list) { struct net_bridge_port *p; tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ID, mrp->ring_id)) goto nla_put_failure; p = rcu_dereference(mrp->p_port); if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_P_IFINDEX, p->dev->ifindex)) goto nla_put_failure; p = rcu_dereference(mrp->s_port); if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_S_IFINDEX, p->dev->ifindex)) goto nla_put_failure; p = rcu_dereference(mrp->i_port); if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_I_IFINDEX, p->dev->ifindex)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_MRP_INFO_PRIO, mrp->prio)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_STATE, mrp->ring_state)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ROLE, mrp->ring_role)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, mrp->test_interval)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, mrp->test_max_miss)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, mrp->test_monitor)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_STATE, mrp->in_state)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_ROLE, mrp->in_role)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL, mrp->in_test_interval)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS, mrp->in_test_max_miss)) goto nla_put_failure; nla_nest_end(skb, tb); } nla_nest_end(skb, mrp_tb); return 0; nla_put_failure: nla_nest_cancel(skb, tb); nla_info_failure: nla_nest_cancel(skb, mrp_tb); return -EMSGSIZE; } int br_mrp_ring_port_open(struct net_device *dev, u8 loc) { struct net_bridge_port *p; int err = 0; p = br_port_get_rcu(dev); if (!p) { err = -EINVAL; goto out; } if (loc) p->flags |= BR_MRP_LOST_CONT; else p->flags &= ~BR_MRP_LOST_CONT; br_ifinfo_notify(RTM_NEWLINK, NULL, p); out: return err; } int br_mrp_in_port_open(struct net_device *dev, u8 loc) { struct net_bridge_port *p; int err = 0; p = br_port_get_rcu(dev); if (!p) { err = -EINVAL; goto out; } if (loc) p->flags |= BR_MRP_LOST_IN_CONT; else p->flags &= ~BR_MRP_LOST_IN_CONT; br_ifinfo_notify(RTM_NEWLINK, NULL, p); out: return err; }
1078 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAP_H #define _LINUX_SWAP_H #include <linux/spinlock.h> #include <linux/linkage.h> #include <linux/mmzone.h> #include <linux/list.h> #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/page-flags.h> #include <uapi/linux/mempolicy.h> #include <asm/page.h> struct notifier_block; struct bio; struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64 static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; } /* * MAX_SWAPFILES defines the maximum number of swaptypes: things which can * be swapped to. The swap type and the offset into that swap type are * encoded into pte's and into pgoff_t's in the swapcache. Using five bits * for the type means that the maximum number of swapcache pages is 27 bits * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 /* * Use some of the swap files numbers for other purposes. This * is a convenient way to hook into the VM to trigger special * actions on faults. */ /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be * applied to the leaves of pgtables. */ #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * * When a page is migrated from CPU to device, we set the CPU page table entry * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. */ #ifdef CONFIG_DEVICE_PRIVATE #define SWP_DEVICE_NUM 4 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) #define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) #else #define SWP_DEVICE_NUM 0 #endif /* * Page migration support. * * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and * indicates that the referenced (part of) an anonymous page is exclusive to * a single process. For SWP_MIGRATION_WRITE, that information is implicit: * (part of) an anonymous page that are mapped writable are exclusive to a * single process. */ #ifdef CONFIG_MIGRATION #define SWP_MIGRATION_NUM 3 #define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) #define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif /* * Handling of hardware poisoned pages with memory corruption. */ #ifdef CONFIG_MEMORY_FAILURE #define SWP_HWPOISON_NUM 1 #define SWP_HWPOISON MAX_SWAPFILES #else #define SWP_HWPOISON_NUM 0 #endif #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is * what the swap magic looks like for the old (limited to 128MB) * swap area format, the second part of the union adds - in the * old reserved area - some extra information. Note that the first * kilobyte is reserved for boot loader or disk label stuff... * * Having the magic at the end of the PAGE_SIZE makes detecting swap * areas somewhat tricky on machines that support multiple page sizes. * For 2.5 we'll probably want to move the magic to just beyond the * bootbits... */ union swap_header { struct { char reserved[PAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { char bootbits[1024]; /* Space for disklabel etc. */ __u32 version; __u32 last_page; __u32 nr_badpages; unsigned char sws_uuid[16]; unsigned char sws_volume[16]; __u32 padding[117]; __u32 badpages[1]; } info; }; /* * current->reclaim_state points to one of these when a task is running * memory reclaim */ struct reclaim_state { /* pages reclaimed outside of LRU-based reclaim */ unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; /* * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based * reclaim * @pages: number of pages reclaimed * * If the current process is undergoing a reclaim operation, increment the * number of reclaimed pages by @pages. */ static inline void mm_account_reclaimed_pages(unsigned long pages) { if (current->reclaim_state) current->reclaim_state->reclaimed += pages; } #ifdef __KERNEL__ struct address_space; struct sysinfo; struct writeback_control; struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; }; /* * Max bad pages in the new format.. */ #define MAX_SWAP_BADPAGES \ ((offsetof(union swap_header, magic.magic) - \ offsetof(union swap_header, info.badpages)) / sizeof(int)) enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * * The data field stores next cluster if the cluster is free or cluster usage * counter otherwise. The flags field determines if a cluster is free. This is * protected by swap_info_struct.lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields * and swap_info_struct->swap_map * elements correspond to the swap * cluster */ unsigned int data:24; unsigned int flags:8; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ #define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */ /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from * its own cluster and swapout sequentially. The purpose is to optimize swapout * throughput. */ struct percpu_cluster { struct swap_cluster_info index; /* Current cluster index */ unsigned int next; /* Likely next allocation offset */ }; struct swap_cluster_list { struct swap_cluster_info head; struct swap_cluster_info tail; }; /* * The in-memory structure used to track swap areas. */ struct swap_info_struct { struct percpu_ref users; /* indicate and keep swap device valid. */ unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct swap_cluster_list free_clusters; /* free clusters list */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct file *bdev_file; /* open handle of the bdev */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, * cluster_nr, lowest_alloc, * highest_alloc, free/discard cluster * list. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ spinlock_t cont_lock; /* * protect swap count continuation page * list. */ struct work_struct discard_work; /* discard worker */ struct swap_cluster_list discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one * entry per node. * Must be last as the number of the * array is nr_node_ids, which is not * a fixed value so have to allocate * dynamically. * And it has to be an array so that * plist_for_each_* can work. */ }; static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; #define mapping_set_update(xas, mapping) do { \ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ xas_set_update(xas, workingset_update_node); \ xas_set_lru(xas, &shadow_nodes); \ } \ } while (0) /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) { return atomic_read(&lru_disable_count); } static inline void lru_cache_enable(void) { atomic_dec(&lru_disable_count); } extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; #else #define node_reclaim_mode 0 #endif static inline bool node_reclaim_enabled(void) { /* Is any node_reclaim_mode bit set? */ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); } void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } static inline long get_nr_swap_pages(void) { return atomic_long_read(&nr_swap_pages); } extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return NULL; } static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; } static inline void put_swap_device(struct swap_info_struct *si) { } #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL #define vm_swap_full() 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) /* only sparc can not include linux/pagemap.h in this file * so leave put_page and release_pages undeclared... */ #define free_page_and_swap_cache(page) \ put_page(page) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) static inline void free_swap_cache(struct page *page) { } static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; } static inline void swap_shmem_alloc(swp_entry_t swp) { } static inline int swap_duplicate(swp_entry_t swp) { return 0; } static inline int swapcache_prepare(swp_entry_t swp) { return 0; } static inline void swap_free(swp_entry_t swp) { } static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } static inline int __swap_count(swp_entry_t entry) { return 0; } static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { return 0; } static inline int swp_swapcount(swp_entry_t entry) { return 0; } static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; entry.val = 0; return entry; } static inline bool folio_free_swap(struct folio *folio) { return false; } static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { return -EINVAL; } #endif /* CONFIG_SWAP */ #ifdef CONFIG_THP_SWAP extern int split_swap_cluster(swp_entry_t entry); #else static inline int split_swap_cluster(swp_entry_t entry) { return 0; } #endif #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return READ_ONCE(vm_swappiness); /* root ? */ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return READ_ONCE(vm_swappiness); return READ_ONCE(memcg->swappiness); } #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { return READ_ONCE(vm_swappiness); } #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; __folio_throttle_swaprate(folio, gfp); } #else static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { } #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_swap(entry, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { } static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { } static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } #endif #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */
3178 3178 3232 295 468 20 244 30 465 20 244 465 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM net #if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NET_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/tracepoint.h> TRACE_EVENT(net_dev_start_xmit, TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), TP_ARGS(skb, dev), TP_STRUCT__entry( __string( name, dev->name ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( int, network_offset ) __field( bool, transport_offset_valid) __field( int, transport_offset) __field( u8, tx_flags ) __field( u16, gso_size ) __field( u16, gso_segs ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name, dev->name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->network_offset = skb_network_offset(skb); __entry->transport_offset_valid = skb_transport_header_was_set(skb); __entry->transport_offset = skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : 0; __entry->tx_flags = skb_shinfo(skb)->tx_flags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, __entry->data_len, __entry->network_offset, __entry->transport_offset_valid, __entry->transport_offset, __entry->tx_flags, __entry->gso_size, __entry->gso_segs, __entry->gso_type) ); TRACE_EVENT(net_dev_xmit, TP_PROTO(struct sk_buff *skb, int rc, struct net_device *dev, unsigned int skb_len), TP_ARGS(skb, rc, dev, skb_len), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; __assign_str(name, dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); TRACE_EVENT(net_dev_xmit_timeout, TP_PROTO(struct net_device *dev, int queue_index), TP_ARGS(dev, queue_index), TP_STRUCT__entry( __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) ), TP_fast_assign( __assign_str(name, dev->name); __assign_str(driver, netdev_drivername(dev)); __entry->queue_index = queue_index; ), TP_printk("dev=%s driver=%s queue=%d", __get_str(name), __get_str(driver), __entry->queue_index) ); DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; __assign_str(name, skb->dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) DEFINE_EVENT(net_dev_template, net_dev_queue, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_receive_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_rx, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __string( name, skb->dev->name ) __field( unsigned int, napi_id ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( u32, hash ) __field( bool, l4_hash ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( unsigned int, truesize ) __field( bool, mac_header_valid) __field( int, mac_header ) __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name, skb->dev->name); #ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else __entry->napi_id = 0; #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; __entry->l4_hash = skb->l4_hash; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->truesize = skb->truesize; __entry->mac_header_valid = skb_mac_header_was_set(skb); __entry->mac_header = skb_mac_header(skb) - skb->data; __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, __entry->mac_header_valid, __entry->mac_header, __entry->nr_frags, __entry->gso_size, __entry->gso_type) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field(int, ret) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), TP_ARGS(ret) ); #endif /* _TRACE_NET_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
33 26 3 3 7 31 3884 3882 3888 3884 3880 3883 3882 3888 164 1614 4 164 1620 1621 1616 4 95 600 95 2 2 3608 3613 3606 2335 964 2 1 3610 43 16 43 49 8 301 281 70 281 8 301 603 604 428 604 158 49 306 504 711 238 505 505 506 504 139 139 2181 14 2177 2181 2179 1523 1522 2 167 1510 1524 1528 1525 1529 16 32 34 34 2154 1533 8 1523 1527 2146 34 15264 5 5 5 5 5 15216 15184 15264 15203 17 17 1623 1622 1622 3 1620 1621 1622 1620 1621 1621 1621 1620 6 3550 15 97 1 86 3 9 1616 465 85 1619 154 1456 1458 1459 8 1617 19 9 10 23 17 13 10 4 3 4 2 2 2 12 10 4 3 1 3 10 10 1 1 10 6 2 1 1 3 3207 3208 345 75 405 15 3653 3655 2211 2201 14 79 80 270 268 270 29 95 1542 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <uapi/linux/btf.h> #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/random.h> #include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> #include <linux/perf_event.h> #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> #include <linux/nospec.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <asm/barrier.h> #include <asm/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] #define BPF_R1 regs[BPF_REG_1] #define BPF_R2 regs[BPF_REG_2] #define BPF_R3 regs[BPF_REG_3] #define BPF_R4 regs[BPF_REG_4] #define BPF_R5 regs[BPF_REG_5] #define BPF_R6 regs[BPF_REG_6] #define BPF_R7 regs[BPF_REG_7] #define BPF_R8 regs[BPF_REG_8] #define BPF_R9 regs[BPF_REG_9] #define BPF_R10 regs[BPF_REG_10] /* Named registers */ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] #define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define OFF insn->off #define IMM insn->imm struct bpf_mem_alloc bpf_global_ma; bool bpf_global_ma_set; /* No hurry in this branch * * Exported for the bpf jit load helper. */ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) { u8 *ptr = NULL; if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; } else if (k >= SKF_LL_OFF) { if (unlikely(!skb_mac_header_was_set(skb))) return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; return NULL; } /* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */ enum page_size_enum { __PAGE_SIZE = PAGE_SIZE }; struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (aux == NULL) { vfree(fp); return NULL; } fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); return NULL; } fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); #ifdef CONFIG_FINEIBT INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); #endif mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->dst_mutex); return fp; } struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *prog; int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) return NULL; prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); if (!prog->stats) { free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; } for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) { if (!prog->aux->nr_linfo || !prog->jit_requested) return 0; prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN)); if (!prog->aux->jited_linfo) return -ENOMEM; return 0; } void bpf_prog_jit_attempt_done(struct bpf_prog *prog) { if (prog->aux->jited_linfo && (!prog->jited || !prog->aux->jited_linfo[0])) { kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } kfree(prog->aux->kfunc_tab); prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array * for insn_off to the jited_off mapping (insn_to_jit_off). * * The idx to this array is the insn_off. Hence, the insn_off * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: * The first bpf insn off of the prog. The insn off * here is relative to the main prog. * e.g. if prog is a subprog, insn_start > 0 * linfo_idx: * The prog's idx to prog->aux->linfo and jited_linfo * * jited_linfo[linfo_idx] = prog->bpf_func * * For i > linfo_idx, * * jited_linfo[i] = prog->bpf_func + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] */ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off) { u32 linfo_idx, insn_start, insn_end, nr_linfo, i; const struct bpf_line_info *linfo; void **jited_linfo; if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; linfo_idx = prog->aux->linfo_idx; linfo = &prog->aux->linfo[linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + prog->len; jited_linfo = &prog->aux->jited_linfo[linfo_idx]; jited_linfo[0] = prog->bpf_func; nr_linfo = prog->aux->nr_linfo - linfo_idx; for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) /* The verifier ensures that linfo[i].insn_off is * strictly increasing */ jited_linfo[i] = prog->bpf_func + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; } struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *fp; u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; fp = __vmalloc(size, gfp_flags); if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. */ fp_old->aux = NULL; fp_old->stats = NULL; fp_old->active = NULL; __bpf_prog_free(fp_old); } return fp; } void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } free_percpu(fp->stats); free_percpu(fp->active); vfree(fp); } int bpf_prog_calc_tag(struct bpf_prog *fp) { const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); u32 digest[SHA1_DIGEST_WORDS]; u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; u8 *raw, *todo; __be32 *result; __be64 *bits; raw = vmalloc(raw_size); if (!raw) return -ENOMEM; sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && (dst[i].src_reg == BPF_PSEUDO_MAP_FD || dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && dst[i].code == 0 && dst[i].dst_reg == 0 && dst[i].src_reg == 0 && dst[i].off == 0) { was_ld_map = false; dst[i].imm = 0; } else { was_ld_map = false; } } psize = bpf_prog_insn_size(fp); memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA1_BLOCK_SIZE); blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { bits = (__be64 *)(todo + bsize + bits_offset); blocks++; } *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { sha1_transform(digest, todo, ws); todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); vfree(raw); return 0; } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; s64 imm = insn->imm; if (curr < pos && curr + imm + 1 >= end_old) imm += delta; else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; if (!probe_pass) insn->imm = imm; return 0; } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { s64 off_min, off_max, off; s32 delta = end_new - end_old; if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; off_min = S32_MIN; off_max = S32_MAX; } else { off = insn->off; off_min = S16_MIN; off_max = S16_MAX; } if (curr < pos && curr + off + 1 >= end_old) off += delta; else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; if (!probe_pass) { if (insn->code == (BPF_JMP32 | BPF_JA)) insn->imm = off; else insn->off = off; } return 0; } static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, s32 end_new, const bool probe_pass) { u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { u8 code; /* In the probing pass we still operate on the original, * unpatched image in order to check overflows before we * do any other adjustments. Therefore skip the patchlet. */ if (probe_pass && i == pos) { i = end_new; insn = prog->insnsi + end_old; } if (bpf_pseudo_func(insn)) { ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); if (ret) return ret; continue; } code = insn->code; if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); } else { ret = bpf_adj_delta_to_off(insn, pos, end_old, end_new, i, probe_pass); } if (ret) break; } return ret; } static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) { struct bpf_line_info *linfo; u32 i, nr_linfo; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo || !delta) return; linfo = prog->aux->linfo; for (i = 0; i < nr_linfo; i++) if (off < linfo[i].insn_off) break; /* Push all off < linfo[i].insn_off by delta */ for (; i < nr_linfo; i++) linfo[i].insn_off += delta; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { memcpy(prog->insnsi + off, patch, sizeof(*patch)); return prog; } insn_adj_cnt = prog->len + insn_delta; /* Reject anything that would potentially let the insn->off * target overflow when we have excessive program expansions. * We need to probe here before we do any reallocation where * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. */ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; /* Patching happens in 3 steps: * * 1) Move over tail of insnsi from next instruction onwards, * so we can patch the single target insn with one or more * new ones (patching is always from 1 to n insns, n > 0). * 2) Inject new instructions at the target location. * 3) Adjust branch offsets if necessary. */ insn_rest = insn_adj_cnt - off - len; memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); /* We are guaranteed to not fail at this point, otherwise * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. */ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ memmove(prog->insnsi + off, prog->insnsi + off + cnt, sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) { bpf_prog_kallsyms_del_subprogs(fp); bpf_prog_kallsyms_del(fp); } #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void bpf_prog_ksym_set_name(struct bpf_prog *prog) { char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + /* name has been null terminated. * We should need +1 for the '_' preceding * the name. However, the null character * is double counted between the name and the * sizeof("bpf_prog_") above, so we omit * the +1 here. */ sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); snprintf(sym, (size_t)(end - sym), "_%s", func_name); return; } if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); else *sym = 0; } static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; const struct bpf_ksym *ksym; ksym = container_of(n, struct bpf_ksym, tnode); if (val < ksym->start) return -1; /* Ensure that we detect return addresses as part of the program, when * the final instruction is a call for a program part of the stack * trace. Therefore, do val > ksym->end instead of val >= ksym->end. */ if (val > ksym->end) return 1; return 0; } static const struct latch_tree_ops bpf_tree_ops = { .less = bpf_tree_less, .comp = bpf_tree_comp, }; static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; void bpf_ksym_add(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); WARN_ON_ONCE(!list_empty(&ksym->lnode)); list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); spin_unlock_bh(&bpf_lock); } static void __bpf_ksym_del(struct bpf_ksym *ksym) { if (list_empty(&ksym->lnode)) return; latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); list_del_rcu(&ksym->lnode); } void bpf_ksym_del(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); __bpf_ksym_del(ksym); spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) { return fp->jited && !bpf_prog_was_classic(fp); } void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); bpf_prog_ksym_set_name(fp); fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); #ifdef CONFIG_FINEIBT /* * When FineIBT, code in the __cfi_foo() symbols can get executed * and hence unwinder needs help. */ if (cfi_mode != CFI_FINEIBT) return; snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, "__cfi_%s", fp->aux->ksym.name); fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; bpf_ksym_add(&fp->aux->ksym_prefix); #endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp)) return; bpf_ksym_del(&fp->aux->ksym); #ifdef CONFIG_FINEIBT if (cfi_mode != CFI_FINEIBT) return; bpf_ksym_del(&fp->aux->ksym_prefix); #endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; char *ret = NULL; rcu_read_lock(); ksym = bpf_ksym_find(addr); if (ksym) { unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; strncpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) *size = symbol_end - symbol_start; if (off) *off = addr - symbol_start; } rcu_read_unlock(); return ret; } bool is_bpf_text_address(unsigned long addr) { bool ret; rcu_read_lock(); ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? container_of(ksym, struct bpf_prog_aux, ksym)->prog : NULL; } const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) goto out; e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); out: rcu_read_unlock(); return e; } int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; if (!bpf_jit_kallsyms_enabled()) return ret; rcu_read_lock(); list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; strncpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; break; } rcu_read_unlock(); return ret; } int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; static const u32 poke_tab_max = 1024; u32 slot = prog->aux->size_poke_tab; u32 size = slot + 1; if (size > poke_tab_max) return -ENOSPC; if (poke->tailcall_target || poke->tailcall_target_stable || poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { case BPF_POKE_REASON_TAIL_CALL: if (!poke->tail_call.map) return -EINVAL; break; default: return -EINVAL; } tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; memcpy(&tab[slot], poke, sizeof(*poke)); prog->aux->size_poke_tab = size; prog->aux->poke_tab = tab; return slot; } /* * BPF program pack allocator. * * Most BPF programs are pretty small. Allocating a hole page for each * program is sometime a waste. Many small bpf program also adds pressure * to instruction TLB. To solve this issue, we introduce a BPF program pack * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) * to host BPF programs. */ #define BPF_PROG_CHUNK_SHIFT 6 #define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) #define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) struct bpf_prog_pack { struct list_head list; void *ptr; unsigned long bitmap[]; }; void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) { memset(area, 0, size); } #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); /* PMD_SIZE is not available in some special config, e.g. ARCH=arm with * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE /* PMD_SIZE is really big for some archs. It doesn't make sense to * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be * greater than or equal to 2MB. */ #define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif #define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) { kfree(pack); return NULL; } bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; unsigned long pos; void *ptr = NULL; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); } goto out; } list_for_each_entry(pack, &pack_list, list) { pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; pos = 0; found_free_area: bitmap_set(pack->bitmap, pos, nbits); ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); out: mutex_unlock(&pack_mutex); return ptr; } void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } } if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(size); pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); bpf_jit_free_exec(pack->ptr); kfree(pack); } out: mutex_unlock(&pack_mutex); } static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, * dedicated BPF backend memory area, or if neither of the two * below apply. */ u64 __weak bpf_jit_alloc_exec_limit(void) { #if defined(MODULES_VADDR) return MODULES_END - MODULES_VADDR; #else return VMALLOC_END - VMALLOC_START; #endif } static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1, PAGE_SIZE), LONG_MAX); return 0; } pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } return 0; } void bpf_jit_uncharge_modmem(u32 size) { atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) { return module_alloc(size); } void __weak bpf_jit_free_exec(void *addr) { module_memfree(addr); } struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = get_random_u32_below(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; return hdr; } void bpf_jit_binary_free(struct bpf_binary_header *hdr) { u32 size = hdr->size; bpf_jit_free_exec(hdr); bpf_jit_uncharge_modmem(size); } /* Allocate jit binary from bpf_prog_pack allocator. * Since the allocated memory is RO+X, the JIT engine cannot write directly * to the memory. To solve this problem, a RW buffer is also allocated at * as the same time. The JIT engine should calculate offsets based on the * RO memory address, but write JITed program to the RW buffer. Once the * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies * the JITed program to the RO memory. */ struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, struct bpf_binary_header **rw_header, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *ro_header; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* add 16 bytes for a random section of illegal instructions */ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; } *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(*rw_header, size); (*rw_header)->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); start = get_random_u32_below(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; return ro_header; } /* Copy JITed text from rw_header to its final location, the ro_header. */ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { void *ptr; ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); kvfree(rw_header); if (IS_ERR(ptr)) { bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; } /* bpf_jit_binary_pack_free is called in two different scenarios: * 1) when the program is freed after; * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). * For case 2), we need to free both the RO memory and the RW buffer. * * bpf_jit_binary_pack_free requires proper ro_header->size. However, * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size * must be set with either bpf_jit_binary_pack_finalize (normal path) or * bpf_arch_text_copy (when jit fails). */ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { u32 size = ro_header->size; bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & BPF_PROG_CHUNK_MASK; return (void *)addr; } static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & PAGE_MASK; return (void *)addr; } /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. */ void __weak bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } bpf_prog_unlock_free(fp); } int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed) { s16 off = insn->off; s32 imm = insn->imm; u8 *addr; int err; *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; if (!*func_addr_fixed) { /* Place-holder address till the last pass has collected * all addresses for JITed subprograms in which case we * can pick them up from prog->aux. */ if (!extra_pass) addr = NULL; else if (prog->aux->func && off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && bpf_jit_supports_far_kfunc_call()) { err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr); if (err) return err; } else { /* Address of a BPF helper call. Since part of the core * kernel, it's always at a fixed location. __bpf_call_base * and the helper with imm relative to it are both in core * kernel. */ addr = (u8 *)__bpf_call_base + imm; } *func_addr = (unsigned long)addr; return 0; } static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * * AX register is inaccessible from user space. It is mapped in * all JITs, and used here for constant blinding rewrites. It is * typically "stateless" meaning its contents are only valid within * the executed instruction, but not across several instructions. * There are a few exceptions however which are further detailed * below. * * Constant blinding is only used by JITs, not in the interpreter. * The interpreter uses AX in some occasions as a local temporary * register e.g. in DIV or MOD instructions. * * In restricted circumstances, the verifier can also use the AX * register for rewrites as long as they do not interfere with * the above cases! */ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) goto out; if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); goto out; } switch (from->code) { case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_MOV | BPF_K: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ALU64 | BPF_ADD | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MOV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_MOD | BPF_K: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_JMP32 | BPF_JEQ | BPF_K: case BPF_JMP32 | BPF_JNE | BPF_K: case BPF_JMP32 | BPF_JGT | BPF_K: case BPF_JMP32 | BPF_JLT | BPF_K: case BPF_JMP32 | BPF_JGE | BPF_K: case BPF_JMP32 | BPF_JLE | BPF_K: case BPF_JMP32 | BPF_JSGT | BPF_K: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); break; case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); if (emit_zext) *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; case BPF_ST | BPF_MEM | BPF_DW: case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); break; } out: return to - to_buff; } static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. */ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); } return fp; } static void bpf_prog_clone_free(struct bpf_prog *fp) { /* aux was stolen by the other clone, so we cannot free * it from this path! It will be freed eventually by the * other program on release. * * At this point, we don't need a deferred release since * clone is guaranteed to not be locked. */ fp->aux = NULL; fp->stats = NULL; fp->active = NULL; __bpf_prog_free(fp); } void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) { /* We have to repoint aux->prog to self, as we don't * know whether fp here is the clone or the original. */ fp->aux->prog = fp; bpf_prog_clone_free(fp_other); } struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; struct bpf_prog *clone, *tmp; int insn_delta, insn_cnt; struct bpf_insn *insn; int i, rewritten; if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); if (!clone) return ERR_PTR(-ENOMEM); insn_cnt = clone->len; insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { if (bpf_pseudo_func(insn)) { /* ld_imm64 with an address of bpf subprog is not * a user controlled constant. Don't randomize it, * since it will conflict with jit_subprogs() logic. */ insn++; i++; continue; } /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. */ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, clone->aux->verifier_zext); if (!rewritten) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); return tmp; } clone = tmp; insn_delta = rewritten - 1; /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; i += insn_delta; } clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. This also needs * to go into kallsyms for correlation from e.g. bpftool, so naming * must not change. */ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } EXPORT_SYMBOL_GPL(__bpf_call_base); /* All UAPI available opcodes. */ #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU, ADD, X), \ INSN_3(ALU, SUB, X), \ INSN_3(ALU, AND, X), \ INSN_3(ALU, OR, X), \ INSN_3(ALU, LSH, X), \ INSN_3(ALU, RSH, X), \ INSN_3(ALU, XOR, X), \ INSN_3(ALU, MUL, X), \ INSN_3(ALU, MOV, X), \ INSN_3(ALU, ARSH, X), \ INSN_3(ALU, DIV, X), \ INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU, ADD, K), \ INSN_3(ALU, SUB, K), \ INSN_3(ALU, AND, K), \ INSN_3(ALU, OR, K), \ INSN_3(ALU, LSH, K), \ INSN_3(ALU, RSH, K), \ INSN_3(ALU, XOR, K), \ INSN_3(ALU, MUL, K), \ INSN_3(ALU, MOV, K), \ INSN_3(ALU, ARSH, K), \ INSN_3(ALU, DIV, K), \ INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ INSN_3(ALU64, SUB, X), \ INSN_3(ALU64, AND, X), \ INSN_3(ALU64, OR, X), \ INSN_3(ALU64, LSH, X), \ INSN_3(ALU64, RSH, X), \ INSN_3(ALU64, XOR, X), \ INSN_3(ALU64, MUL, X), \ INSN_3(ALU64, MOV, X), \ INSN_3(ALU64, ARSH, X), \ INSN_3(ALU64, DIV, X), \ INSN_3(ALU64, MOD, X), \ INSN_2(ALU64, NEG), \ INSN_3(ALU64, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU64, ADD, K), \ INSN_3(ALU64, SUB, K), \ INSN_3(ALU64, AND, K), \ INSN_3(ALU64, OR, K), \ INSN_3(ALU64, LSH, K), \ INSN_3(ALU64, RSH, K), \ INSN_3(ALU64, XOR, K), \ INSN_3(ALU64, MUL, K), \ INSN_3(ALU64, MOV, K), \ INSN_3(ALU64, ARSH, K), \ INSN_3(ALU64, DIV, K), \ INSN_3(ALU64, MOD, K), \ /* Call instruction. */ \ INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ /* 32-bit Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP32, JEQ, X), \ INSN_3(JMP32, JNE, X), \ INSN_3(JMP32, JGT, X), \ INSN_3(JMP32, JLT, X), \ INSN_3(JMP32, JGE, X), \ INSN_3(JMP32, JLE, X), \ INSN_3(JMP32, JSGT, X), \ INSN_3(JMP32, JSLT, X), \ INSN_3(JMP32, JSGE, X), \ INSN_3(JMP32, JSLE, X), \ INSN_3(JMP32, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP32, JEQ, K), \ INSN_3(JMP32, JNE, K), \ INSN_3(JMP32, JGT, K), \ INSN_3(JMP32, JLT, K), \ INSN_3(JMP32, JGE, K), \ INSN_3(JMP32, JLE, K), \ INSN_3(JMP32, JSGT, K), \ INSN_3(JMP32, JSLT, K), \ INSN_3(JMP32, JSGE, K), \ INSN_3(JMP32, JSLE, K), \ INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ INSN_3(JMP, JNE, X), \ INSN_3(JMP, JGT, X), \ INSN_3(JMP, JLT, X), \ INSN_3(JMP, JGE, X), \ INSN_3(JMP, JLE, X), \ INSN_3(JMP, JSGT, X), \ INSN_3(JMP, JSLT, X), \ INSN_3(JMP, JSGE, X), \ INSN_3(JMP, JSLE, X), \ INSN_3(JMP, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP, JEQ, K), \ INSN_3(JMP, JNE, K), \ INSN_3(JMP, JGT, K), \ INSN_3(JMP, JLT, K), \ INSN_3(JMP, JGE, K), \ INSN_3(JMP, JLE, K), \ INSN_3(JMP, JSGT, K), \ INSN_3(JMP, JSLT, K), \ INSN_3(JMP, JSGE, K), \ INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ INSN_3(STX, ATOMIC, W), \ INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ INSN_3(ST, MEM, W), \ INSN_3(ST, MEM, DW), \ /* Load instructions. */ \ /* Register based. */ \ INSN_3(LDX, MEM, B), \ INSN_3(LDX, MEM, H), \ INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ INSN_3(LDX, MEMSX, B), \ INSN_3(LDX, MEMSX, H), \ INSN_3(LDX, MEMSX, W), \ /* Immediate based. */ \ INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { #define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true static const bool public_insntable[256] = { [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ [BPF_LD | BPF_ABS | BPF_B] = true, [BPF_LD | BPF_ABS | BPF_H] = true, [BPF_LD | BPF_ABS | BPF_W] = true, [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL return public_insntable[code]; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. * * Return: whatever value is in %BPF_R0 at program exit */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void * const jumptable[256] __annotate_jump_table = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B, [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H, [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) select_insn: goto *jumptable[insn->code]; /* Explicitly mask the register-based shift amounts with 63 or 31 * to avoid undefined behavior. Normally this won't affect the * generated code, for example, in case of native 64 bit archs such * as x86-64 or arm64, the compiler is optimizing the AND away for * the interpreter. In case of JITs, each of the JIT backends compiles * the BPF shift operations to machine instructions which produce * implementation-defined results in such a case; the resulting * contents of the register may be arbitrary, but program behaviour * as a whole remains defined. In other words, in case of JIT backends, * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. */ /* ALU (shifts) */ #define SHT(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP (SRC & 63); \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP ((u32) SRC & 31); \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; /* ALU (rest) */ #define ALU(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP SRC; \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP (u32) SRC; \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; ALU(ADD, +) ALU(SUB, -) ALU(AND, &) ALU(OR, |) ALU(XOR, ^) ALU(MUL, *) SHT(LSH, <<) SHT(RSH, >>) #undef SHT #undef ALU ALU_NEG: DST = (u32) -DST; CONT; ALU64_NEG: DST = -DST; CONT; ALU_MOV_X: switch (OFF) { case 0: DST = (u32) SRC; break; case 8: DST = (u32)(s8) SRC; break; case 16: DST = (u32)(s16) SRC; break; } CONT; ALU_MOV_K: DST = (u32) IMM; CONT; ALU64_MOV_X: switch (OFF) { case 0: DST = SRC; break; case 8: DST = (s8) SRC; break; case 16: DST = (s16) SRC; break; case 32: DST = (s32) SRC; break; } CONT; ALU64_MOV_K: DST = IMM; CONT; LD_IMM_DW: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; ALU_ARSH_X: DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); CONT; ALU_ARSH_K: DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= (SRC & 63); CONT; ALU64_ARSH_K: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: switch (OFF) { case 0: div64_u64_rem(DST, SRC, &AX); DST = AX; break; case 1: AX = div64_s64(DST, SRC); DST = DST - AX * SRC; break; } CONT; ALU_MOD_X: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) SRC); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)SRC)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_MOD_K: switch (OFF) { case 0: div64_u64_rem(DST, IMM, &AX); DST = AX; break; case 1: AX = div64_s64(DST, IMM); DST = DST - AX * IMM; break; } CONT; ALU_MOD_K: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) IMM); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)IMM)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_DIV_X: switch (OFF) { case 0: DST = div64_u64(DST, SRC); break; case 1: DST = div64_s64(DST, SRC); break; } CONT; ALU_DIV_X: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) SRC); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)SRC)); if (((s32)DST < 0) == ((s32)SRC < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU64_DIV_K: switch (OFF) { case 0: DST = div64_u64(DST, IMM); break; case 1: DST = div64_s64(DST, IMM); break; } CONT; ALU_DIV_K: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) IMM); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)IMM)); if (((s32)DST < 0) == ((s32)IMM < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU_END_TO_BE: switch (IMM) { case 16: DST = (__force u16) cpu_to_be16(DST); break; case 32: DST = (__force u32) cpu_to_be32(DST); break; case 64: DST = (__force u64) cpu_to_be64(DST); break; } CONT; ALU_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) cpu_to_le16(DST); break; case 32: DST = (__force u32) cpu_to_le32(DST); break; case 64: DST = (__force u64) cpu_to_le64(DST); break; } CONT; ALU64_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) __swab16(DST); break; case 32: DST = (__force u32) __swab32(DST); break; case 64: DST = (__force u64) __swab64(DST); break; } CONT; /* CALL */ JMP_CALL: /* Function call scratches BPF_R1-BPF_R5 registers, * preserves BPF_R6-BPF_R9, and stores return value * into BPF_R0. */ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5); CONT; JMP_CALL_ARGS: BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5, insn + insn->off + 1); CONT; JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; goto select_insn; out: CONT; } JMP_JA: insn += insn->off; CONT; JMP32_JA: insn += insn->imm; CONT; JMP_EXIT: return BPF_R0; /* JMP */ #define COND_JMP(SIGN, OPCODE, CMP_OP) \ JMP_##OPCODE##_X: \ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_X: \ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP_##OPCODE##_K: \ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_K: \ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; COND_JMP(u, JEQ, ==) COND_JMP(u, JNE, !=) COND_JMP(u, JGT, >) COND_JMP(u, JLT, <) COND_JMP(u, JGE, >=) COND_JMP(u, JLE, <=) COND_JMP(u, JSET, &) COND_JMP(s, JSGT, >) COND_JMP(s, JSLT, <) COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: /* Speculation barrier for mitigating Speculative Store Bypass. * In case of arm64, we rely on the firmware mitigation as * controlled via the ssbd kernel parameter. Whenever the * mitigation is enabled, it works for all of the kernel code * with no need to provide any additional instructions here. * In case of x86, we use 'lfence' insn for mitigation. We * reuse preexisting logic from Spectre v1 mitigation that * happens to produce the required code on x86 for v4 as well. */ barrier_nospec(); CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ CONT; \ ST_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEM_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) LDST(H, u16) LDST(W, u32) LDST(DW, u64) #undef LDST #define LDSX(SIZEOP, SIZE) \ LDX_MEMSX_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEMSX_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDSX(B, s8) LDSX(H, s16) LDSX(W, s32) #undef LDSX #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ else \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ else \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ break; STX_ATOMIC_DW: STX_ATOMIC_W: switch (IMM) { ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: if (BPF_SIZE(insn->code) == BPF_W) SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); else SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); else BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); break; default: goto default_label; } CONT; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here * instead of just returning 0; we could be somewhere in a subprog, * so execution could continue otherwise which we do /not/ want. * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", insn->code, insn->imm); BUG_ON(1); return 0; } #define PROG_NAME(stack_size) __bpf_prog_run##stack_size #define DEFINE_BPF_PROG_RUN(stack_size) \ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size #define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) #define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) #define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) #define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), static __maybe_unused u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif #else static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; } #endif bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; if (fp->kprobe_override) return false; /* XDP programs inserted into maps are not guaranteed to run on * a particular netdev (and can run outside driver context entirely * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. */ if (bpf_prog_is_dev_bound(fp->aux)) return false; spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for * compatibility. */ map->owner.type = prog_type; map->owner.jited = fp->jited; map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { ret = map->owner.type == prog_type && map->owner.jited == fp->jited && map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } spin_unlock(&map->owner.lock); return ret; } static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; int i, ret = 0; mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; if (!map_type_contains_progs(map)) continue; if (!bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } } out: mutex_unlock(&aux->used_maps_mutex); return ret; } static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else fp->bpf_func = __bpf_prog_ret0_warn; #endif } /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ bool jit_needed = false; if (fp->bpf_func) goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || bpf_prog_has_kfunc_call(fp)) jit_needed = true; bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during * blinding, bpf_int_jit_compile() must always return a * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } } else { *err = bpf_prog_offload_compile(fp); if (*err) return fp; } finalize: bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static unsigned int __bpf_prog_ret1(const void *ctx, const struct bpf_insn *insn) { return 1; } static struct bpf_prog_dummy { struct bpf_prog prog; } dummy_bpf_prog = { .prog = { .bpf_func = __bpf_prog_ret1, }, }; struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + sizeof(struct bpf_prog_array_item) * (prog_cnt + 1), flags); return &bpf_empty_prog_array.hdr; } void bpf_prog_array_free(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; /* If RCU Tasks Trace grace period implies RCU grace period, there is * no need to call kfree_rcu(), just call kfree() directly. */ progs = container_of(rcu, struct bpf_prog_array, rcu); if (rcu_trace_implies_rcu_gp()) kfree(progs); else kfree_rcu(progs, rcu); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); } int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; return cnt; } bool bpf_prog_array_is_empty(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) return false; return true; } static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { item++; break; } } return !!(item->prog); } int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; bool nospc; u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); * so below kcalloc doesn't need extra cnt > 0 check. */ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; nospc = bpf_prog_array_copy_core(array, ids, cnt); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) return -EFAULT; if (nospc) return -ENOSPC; return 0; } void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } /** * bpf_prog_array_delete_safe_at() - Replaces the program at the given * index into the program array with * a dummy no-op program. * @array: a bpf_prog_array * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating * the position of the program to replace. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) { return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); } /** * bpf_prog_array_update_at() - Updates the program at the given index * into the program array. * @array: a bpf_prog_array * @index: the index of the program to update * @prog: the program to insert into the array * * Skips over dummy programs, by not counting them, when calculating * the position of the program to update. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog) { struct bpf_prog_array_item *item; if (unlikely(index < 0)) return -EINVAL; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; if (!index) { WRITE_ONCE(item->prog, prog); return 0; } index--; } return -ENOENT; } int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; /* Figure out how many existing progs we need to carry over to * the new array. */ if (old_array) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog) { found_exclude = true; continue; } if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (existing->prog == include_prog) return -EEXIST; } } if (exclude_prog && !found_exclude) return -ENOENT; /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) new_prog_cnt += 1; /* Do we have any prog (not NULL) in the new array? */ if (!new_prog_cnt) { *new_array = NULL; return 0; } /* +1 as the end of prog_array is marked with NULL */ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog || existing->prog == &dummy_bpf_prog.prog) continue; new->prog = existing->prog; new->bpf_cookie = existing->bpf_cookie; new++; } } if (include_prog) { new->prog = include_prog; new->bpf_cookie = bpf_cookie; new++; } new->prog = NULL; *new_array = array; return 0; } int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; bool sleepable; u32 i; sleepable = aux->prog->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); if (sleepable) atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } static void bpf_free_used_maps(struct bpf_prog_aux *aux) { __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } void __bpf_free_used_btfs(struct bpf_prog_aux *aux, struct btf_mod_pair *used_btfs, u32 len) { #ifdef CONFIG_BPF_SYSCALL struct btf_mod_pair *btf_mod; u32 i; for (i = 0; i < len; i++) { btf_mod = &used_btfs[i]; if (btf_mod->module) module_put(btf_mod->module); btf_put(btf_mod->btf); } #endif } static void bpf_free_used_btfs(struct bpf_prog_aux *aux) { __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); kfree(aux->used_btfs); } static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. */ aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { bpf_jit_free(aux->prog); } } void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; if (aux->dst_prog) bpf_prog_put(aux->dst_prog); bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); /* RNG for unpriviledged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) { prandom_init_once(&bpf_user_rnd_state); } BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that * this function is called from native eBPF and classic-to-eBPF * transformations. Register assignments from both sides are * different, f.e. classic always sets fn(ctx, A, X) here. */ struct rnd_state *state; u32 res; state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); put_cpu_var(bpf_user_rnd_state); return res; } BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto bpf_set_retval_proto __weak; const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) { return NULL; } u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { return -ENOTSUPP; } EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { .func = NULL, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; /* Stub for JITs that only support cBPF. eBPF programs are interpreted. * It is encouraged to implement bpf_int_jit_compile() instead, so that * eBPF and implicitly also cBPF can get JITed! */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } /* Stub for JITs that support eBPF. All cBPF code gets transformed into * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). */ void __weak bpf_jit_compile(struct bpf_prog *prog) { } bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } /* Return TRUE if the JIT backend wants verifier to enable sub-register usage * analysis code and wants explicit zero extension inserted by verifier. * Otherwise, return FALSE. * * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if * you don't override this. JITs that don't want these extra insns can detect * them using insn_is_zext. */ bool __weak bpf_jit_needs_zext(void) { return false; } /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { return false; } bool __weak bpf_jit_supports_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_arena(void) { return false; } /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same * as atomic_xchg() on pointer-sized words. */ bool __weak bpf_jit_supports_ptr_xchg(void) { return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { return -EFAULT; } int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2) { return -ENOTSUPP; } void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) { return ERR_PTR(-ENOTSUPP); } int __weak bpf_arch_text_invalidate(void *dst, size_t len) { return -ENOTSUPP; } bool __weak bpf_jit_supports_exceptions(void) { return false; } void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) { return 0; } __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { int ret; ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; return ret; } late_initcall(bpf_global_ma_init); #endif DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
17 18 20 20 18 3 17 15 2 13 13 179 179 178 79 81 80 1 46 24 82 82 1 1 81 82 82 82 82 82 82 81 82 82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 // SPDX-License-Identifier: GPL-2.0 /* * event tracer * * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * * - Added format output of fields of the trace point. * This was based off of work by Tom Zanussi <tzanussi@gmail.com>. * */ #define pr_fmt(fmt) fmt #include <linux/workqueue.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/delay.h> #include <trace/events/sched.h> #include <trace/syscall.h> #include <asm/setup.h> #include "trace_output.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); static bool eventdir_initialized; static LIST_HEAD(module_strings); struct module_string { struct list_head next; struct module *module; char *str; }; #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; static inline int system_refcount(struct event_subsystem *system) { return system->ref_count; } static int system_refcount_inc(struct event_subsystem *system) { return system->ref_count++; } static int system_refcount_dec(struct event_subsystem *system) { return --system->ref_count; } /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ list_for_each_entry(file, &tr->events, list) #define do_for_each_event_file_safe(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ struct trace_event_file *___n; \ list_for_each_entry_safe(file, ___n, &tr->events, list) #define while_for_each_event_file() \ } static struct ftrace_event_field * __find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } return NULL; } struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name) { struct ftrace_event_field *field; struct list_head *head; head = trace_get_fields(call); field = __find_event_field(head, name); if (field) return field; field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type, int len) { struct ftrace_event_field *field; field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) return -ENOMEM; field->name = name; field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); else field->filter_type = filter_type; field->offset = offset; field->size = size; field->is_signed = is_signed; field->len = len; list_add(&field->link, head); return 0; } int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct list_head *head; if (WARN_ON(!call->class)) return 0; head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, is_signed, filter_type, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type, int len) { struct list_head *head; if (WARN_ON(!call->class)) return 0; head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, is_signed, filter_type, len); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ filter_type, 0); \ if (ret) \ return ret; #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ is_signed_type(type), FILTER_OTHER, 0); \ if (ret) \ return ret; static int trace_define_generic_fields(void) { int ret; __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); __generic_field(char *, stacktrace, FILTER_STACKTRACE); __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } static int trace_define_common_fields(void) { int ret; struct trace_entry ent; __common_field(unsigned short, type); __common_field(unsigned char, flags); /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); return ret; } static void trace_destroy_fields(struct trace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kmem_cache_free(field_cachep, field); } } /* * run-time version of trace_event_get_offsets_<call>() that returns the last * accessible offset of trace fields excluding __dynamic_array bytes */ int trace_event_get_offsets(struct trace_event_call *call) { struct ftrace_event_field *tail; struct list_head *head; head = trace_get_fields(call); /* * head->next points to the last field with the largest offset, * since it was added last by trace_define_field() */ tail = list_first_entry(head, struct ftrace_event_field, link); return tail->offset + tail->size; } /* * Check if the referenced field is an array and return true, * as arrays are OK to dereference. */ static bool test_field(const char *fmt, struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; const char *array_descriptor; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) return false; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') break; } len = p - fmt; for (; field->type; field++) { if (strncmp(field->name, fmt, len) || field->name[len]) continue; array_descriptor = strchr(field->type, '['); /* This is an array and is OK to dereference. */ return array_descriptor != NULL; } return false; } /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and * much later referenced after the pointer was freed. Dereferencing * pointers are OK, if it is dereferenced into the event itself. */ static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; bool first = true; const char *fmt, *c, *r, *a; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; int i; fmt = call->print_fmt; if (!fmt) return; for (i = 0; fmt[i]; i++) { switch (fmt[i]) { case '\\': i++; if (!fmt[i]) return; continue; case '"': case '\'': /* * The print fmt starts with a string that * is processed first to find %p* usage, * then after the first string, the print fmt * contains arguments that are used to check * if the dereferenced %p* usage is safe. */ if (first) { if (fmt[i] == '\'') continue; if (in_quote) { arg = 0; first = false; /* * If there was no %p* uses * the fmt is OK. */ if (!dereference_flags) return; } } if (in_quote) { if (in_quote == fmt[i]) in_quote = 0; } else { in_quote = fmt[i]; } continue; case '%': if (!first || !in_quote) continue; i++; if (!fmt[i]) return; switch (fmt[i]) { case '%': continue; case 'p': /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': case 'b': case 'M': case 'm': case 'I': case 'i': case 'E': case 'U': case 'V': case 'N': case 'a': case 'd': case 'D': case 'g': case 't': case 'C': case 'O': case 'f': if (WARN_ONCE(arg == 63, "Too many args for event: %s", trace_event_name(call))) return; dereference_flags |= 1ULL << arg; } break; default: { bool star = false; int j; /* Increment arg if %*s exists. */ for (j = 0; fmt[i + j]; j++) { if (isdigit(fmt[i + j]) || fmt[i + j] == '.') continue; if (fmt[i + j] == '*') { star = true; continue; } if ((fmt[i + j] == 's') && star) arg++; break; } break; } /* default */ } /* switch */ arg++; continue; case '(': if (in_quote) continue; parens++; continue; case ')': if (in_quote) continue; parens--; if (WARN_ONCE(parens < 0, "Paren mismatch for event: %s\narg='%s'\n%*s", trace_event_name(call), fmt + start_arg, (i - start_arg) + 5, "^")) return; continue; case ',': if (in_quote || parens) continue; i++; while (isspace(fmt[i])) i++; start_arg = i; if (!(dereference_flags & (1ULL << arg))) goto next_arg; /* Find the REC-> in the argument */ c = strchr(fmt + i, ','); r = strstr(fmt + i, "REC->"); if (r && (!c || r < c)) { /* * Addresses of events on the buffer, * or an array on the buffer is * OK to dereference. * There's ways to fool this, but * this is to catch common mistakes, * not malicious code. */ a = strchr(fmt + i, '&'); if ((a && (a < r)) || test_field(r, call)) dereference_flags &= ~(1ULL << arg); } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && (!c || r < c)) { dereference_flags &= ~(1ULL << arg); } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && (!c || r < c)) { dereference_flags &= ~(1ULL << arg); } next_arg: i--; arg++; } } /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored * at the trace event time may no longer exist when the trace * event is printed, dereferencing to the original source is * unsafe. The source of the dereference must be copied into the * event itself, and the dereference must access the copy instead. */ if (WARN_ON_ONCE(dereference_flags)) { arg = 1; while (!(dereference_flags & 1)) { dereference_flags >>= 1; arg++; } pr_warn("event %s has unsafe dereference of argument %d\n", trace_event_name(call), arg); pr_warn("print_fmt: %s\n", fmt); } } int trace_event_raw_init(struct trace_event_call *call) { int id; id = register_trace_event(&call->event); if (!id) return -ENODEV; test_event_printk(call); return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; struct trace_array_cpu *data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_raw(tr->filtered_pids); no_pid_list = rcu_dereference_raw(tr->filtered_no_pids); if (!pid_list && !no_pid_list) return false; data = this_cpu_ptr(tr->array_buffer.data); return data->ignore_pid; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) { struct trace_event_call *event_call = trace_file->event_call; if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && trace_event_ignore_this_pid(trace_file)) return NULL; /* * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ fbuffer->trace_ctx = tracing_gen_ctx_dec(); fbuffer->trace_file = trace_file; fbuffer->event = trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, fbuffer->trace_ctx); if (!fbuffer->event) return NULL; fbuffer->regs = NULL; fbuffer->entry = ring_buffer_event_data(fbuffer->event); return fbuffer->entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { struct trace_event_file *file = data; WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->tp, call->class->probe, file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->probe, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: case TRACE_REG_PERF_DEL: return 0; #endif } return 0; } EXPORT_SYMBOL_GPL(trace_event_reg); void trace_event_enable_cmd_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); } void trace_event_enable_tgid_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } else { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } } while_for_each_event_file(); } static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; int ret = 0; int disable; switch (enable) { case 0: /* * When soft_disable is set and enable is cleared, the sm_ref * reference counter is decremented. If it reaches 0, we want * to clear the SOFT_DISABLED flag but leave the event in the * state that it was. That is, if the event was enabled and * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED * is set we do not want the event to be enabled before we * clear the bit. * * When soft_disable is not set but the SOFT_MODE flag is, * we do nothing. Do not disable the tracepoint, otherwise * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. */ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); if (file->flags & EVENT_FILE_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do * nothing (but set SOFT_MODE). If the event is disabled, we * set SOFT_DISABLED before enabling the event tracepoint, so * it still seems to be disabled. */ if (!soft_disable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; /* Keep the event disabled, when going to SOFT_MODE. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { if (cmd) tracing_stop_cmdline_record(); if (tgid) tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; } set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } return ret; } int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { return __ftrace_event_enable_disable(file, enable, soft_disable); } static int ftrace_event_enable_disable(struct trace_event_file *file, int enable) { return __ftrace_event_enable_disable(file, enable, 0); } static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); pid_list = rcu_dereference_raw(tr->filtered_no_pids); trace_filter_add_remove_task(pid_list, NULL, task); } static void event_filter_pid_sched_process_fork(void *data, struct task_struct *self, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); pid_list = rcu_dereference_sched(tr->filtered_no_pids); trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) { if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; bool ret; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* * Sched switch is funny, as we only want to ignore it * in the notrace case if both prev and next should be ignored. */ ret = trace_ignore_this_task(NULL, no_pid_list, prev) && trace_ignore_this_task(NULL, no_pid_list, next); this_cpu_write(tr->array_buffer.data->ignore_pid, ret || (trace_ignore_this_task(pid_list, NULL, prev) && trace_ignore_this_task(pid_list, NULL, next))); } static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, next)); } static void event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are already tracing */ if (!this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, task)); } static void event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are not tracing */ if (this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* Set tracing if current is enabled */ this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, current)); } static void unregister_pid_events(struct trace_array *tr) { unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); } static void __ftrace_clear_event_pids(struct trace_array *tr, int type) { struct trace_pid_list *pid_list; struct trace_pid_list *no_pid_list; struct trace_event_file *file; int cpu; pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); /* Make sure there's something to do */ if (!pid_type_enabled(type, pid_list, no_pid_list)) return; if (!still_need_pid_events(type, pid_list, no_pid_list)) { unregister_pid_events(tr); list_for_each_entry(file, &tr->events, list) { clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } for_each_possible_cpu(cpu) per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; } if (type & TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, NULL); if (type & TRACE_NO_PIDS) rcu_assign_pointer(tr->filtered_no_pids, NULL); /* Wait till all users are no longer using pid filtering */ tracepoint_synchronize_unregister(); if ((type & TRACE_PIDS) && pid_list) trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) trace_pid_list_free(no_pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr, int type) { mutex_lock(&event_mutex); __ftrace_clear_event_pids(tr, type); mutex_unlock(&event_mutex); } static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; WARN_ON_ONCE(system_refcount(system) == 0); if (system_refcount_dec(system)) return; list_del(&system->list); if (filter) { kfree(filter->filter_string); kfree(filter); } kfree_const(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { WARN_ON_ONCE(system_refcount(system) == 0); system_refcount_inc(system); } static void __get_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); dir->ref_count++; __get_system(dir->subsystem); } static void __put_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); __put_system(dir->subsystem); if (!--dir->ref_count) kfree(dir); } static void put_system(struct trace_subsystem_dir *dir) { mutex_lock(&event_mutex); __put_system_dir(dir); mutex_unlock(&event_mutex); } static void remove_subsystem(struct trace_subsystem_dir *dir) { if (!dir) return; if (!--dir->nr_events) { eventfs_remove_dir(dir->ei); list_del(&dir->list); __put_system_dir(dir); } } void event_file_get(struct trace_event_file *file) { atomic_inc(&file->ref); } void event_file_put(struct trace_event_file *file) { if (WARN_ON_ONCE(!atomic_read(&file->ref))) { if (file->flags & EVENT_FILE_FL_FREED) kmem_cache_free(file_cachep, file); return; } if (atomic_dec_and_test(&file->ref)) { /* Count should only go to zero when it is freed */ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) return; kmem_cache_free(file_cachep, file); } } static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); file->flags |= EVENT_FILE_FL_FREED; event_file_put(file); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { struct trace_event_file *file; struct trace_event_call *call; const char *name; int ret = -EINVAL; int eret = 0; list_for_each_entry(file, &tr->events, list) { call = file->event_call; name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, name) != 0) continue; ret = ftrace_event_enable_disable(file, set); /* * Save the first error and return that. Some events * may still have been enabled, but let the user * know that something went wrong. */ if (ret && !eret) eret = ret; ret = eret; } return ret; } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { int ret; mutex_lock(&event_mutex); ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); mutex_unlock(&event_mutex); return ret; } int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; int ret; if (!tr) return -ENOENT; /* * The buf format can be <subsystem>:<event-name> * *:<event-name> means any event by that name. * :<event-name> is the same. * * <subsystem>:* means all events in that subsystem * <subsystem>: means the same. * * <name> (no ':') means all events in a subsystem with * the name <name> or any event that matches <name> */ match = strsep(&buf, ":"); if (buf) { sub = match; event = buf; match = NULL; if (!strlen(sub) || strcmp(sub, "*") == 0) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; } ret = __ftrace_set_clr_event(tr, match, sub, event, set); /* Put back the colon to allow this to be called again */ if (buf) *(buf - 1) = ':'; return ret; } /** * trace_set_clr_event - enable or disable an event * @system: system name to match (NULL for any system) * @event: event name to match (NULL for all events, within system) * @set: 1 to enable, 0 to disable * * This is a way for other parts of the kernel to enable or disable * event recording. * * Returns 0 on success, -EINVAL if the parameters do not match any * registered events. */ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); if (!tr) return -ENODEV; return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); /** * trace_array_set_clr_event - enable or disable an event for a trace array. * @tr: concerned trace array. * @system: system name to match (NULL for any system) * @event: event name to match (NULL for all events, within system) * @enable: true to enable, false to disable * * This is a way for other parts of the kernel to enable or disable * event recording. * * Returns 0 on success, -EINVAL if the parameters do not match any * registered events. */ int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable) { int set; if (!tr) return -ENOENT; set = (enable == true) ? 1 : 0; return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_array_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; struct seq_file *m = file->private_data; struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) return 0; ret = tracing_update_buffers(tr); if (ret < 0) return ret; if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { int set = 1; if (*parser.buffer == '!') set = 0; ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } ret = read; out_put: trace_parser_put(&parser); return ret; } static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = v; struct trace_event_call *call; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->class && call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = t_next(m, file, &l); if (!file) break; } return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = v; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { if (file->flags & EVENT_FILE_FL_ENABLED) return file; } return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = s_next(m, file, &l); if (!file) break; } return file; } static int t_show(struct seq_file *m, void *v) { struct trace_event_file *file = v; struct trace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", trace_event_name(call)); return 0; } static void t_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } static void * __next(struct seq_file *m, void *v, loff_t *pos, int type) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list; if (type == TRACE_PIDS) pid_list = rcu_dereference_sched(tr->filtered_pids); else pid_list = rcu_dereference_sched(tr->filtered_no_pids); return trace_pid_next(pid_list, v, pos); } static void * p_next(struct seq_file *m, void *v, loff_t *pos) { return __next(m, v, pos, TRACE_PIDS); } static void * np_next(struct seq_file *m, void *v, loff_t *pos) { return __next(m, v, pos, TRACE_NO_PIDS); } static void *__start(struct seq_file *m, loff_t *pos, int type) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; /* * Grab the mutex, to keep calls to p_next() having the same * tr->filtered_pids as p_start() has. * If we just passed the tr->filtered_pids around, then RCU would * have been enough, but doing that makes things more complex. */ mutex_lock(&event_mutex); rcu_read_lock_sched(); if (type == TRACE_PIDS) pid_list = rcu_dereference_sched(tr->filtered_pids); else pid_list = rcu_dereference_sched(tr->filtered_no_pids); if (!pid_list) return NULL; return trace_pid_start(pid_list, pos); } static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { return __start(m, pos, TRACE_PIDS); } static void *np_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { return __start(m, pos, TRACE_NO_PIDS); } static void p_stop(struct seq_file *m, void *p) __releases(RCU) { rcu_read_unlock_sched(); mutex_unlock(&event_mutex); } static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long flags; char buf[4] = "0"; mutex_lock(&event_mutex); file = event_file_data(filp); if (likely(file)) flags = file->flags; mutex_unlock(&event_mutex); if (!file || flags & EVENT_FILE_FL_FREED) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); if (flags & EVENT_FILE_FL_SOFT_DISABLED || flags & EVENT_FILE_FL_SOFT_MODE) strcat(buf, "*"); strcat(buf, "\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; switch (val) { case 0: case 1: ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); return ret; } ret = ftrace_event_enable_disable(file, val); } mutex_unlock(&event_mutex); break; default: return -EINVAL; } *ppos += cnt; return ret ? ret : cnt; } static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; struct trace_array *tr = dir->tr; char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) continue; /* * We need to find out if all the events are set * or if all events or cleared, or if we have * a mixture. */ set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED)); /* * If we have a mixture, no need to look further. */ if (set == 3) break; } mutex_unlock(&event_mutex); buf[0] = set_to_char[set]; buf[1] = '\n'; ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); return ret; } static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; ret = tracing_update_buffers(dir->tr); if (ret < 0) return ret; if (val != 0 && val != 1) return -EINVAL; /* * Opening of "enable" adds a ref count to system, * so the name is safe to use. */ if (system) name = system->name; ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; ret = cnt; out: *ppos += cnt; return ret; } enum { FORMAT_HEADER = 1, FORMAT_FIELD_SEPERATOR = 2, FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_call *call = event_file_data(m->private); struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: node = common_head; break; case FORMAT_FIELD_SEPERATOR: node = head; break; case FORMAT_PRINTFMT: /* all done */ return NULL; } node = node->prev; if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; else if (node == head) return (void *)FORMAT_PRINTFMT; else return node; } static int f_show(struct seq_file *m, void *v) { struct trace_event_call *call = event_file_data(m->private); struct ftrace_event_field *field; const char *array_descriptor; switch ((unsigned long)v) { case FORMAT_HEADER: seq_printf(m, "name: %s\n", trace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_puts(m, "format:\n"); return 0; case FORMAT_FIELD_SEPERATOR: seq_putc(m, '\n'); return 0; case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: * field:TYPE VAR * If TYPE := TYPE[LEN], it is shown: * field:TYPE VAR[LEN] */ array_descriptor = strchr(field->type, '['); if (str_has_prefix(field->type, "__data_loc")) array_descriptor = NULL; if (!array_descriptor) seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); else if (field->len) seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, field->len, field->offset, field->size, !!field->is_signed); else seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, field->offset, field->size, !!field->is_signed); return 0; } static void *f_start(struct seq_file *m, loff_t *pos) { void *p = (void *)FORMAT_HEADER; loff_t l = 0; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); if (!event_file_data(m->private)) return ERR_PTR(-ENODEV); while (l < *pos && p) p = f_next(m, p, &l); return p; } static void f_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } static const struct seq_operations trace_format_seq_ops = { .start = f_start, .next = f_next, .stop = f_stop, .show = f_show, }; static int trace_format_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; /* Do we want to hide event format files on tracefs lockdown? */ ret = seq_open(file, &trace_format_seq_ops); if (ret < 0) return ret; m = file->private_data; m->private = file; return 0; } static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { int id = (long)event_file_data(filp); char buf[32]; int len; if (unlikely(!id)) return -ENODEV; len = sprintf(buf, "%d\n", id); return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; struct trace_seq *s; int r = -ENODEV; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); mutex_lock(&event_mutex); file = event_file_data(filp); if (file && !(file->flags & EVENT_FILE_FL_FREED)) print_event_filter(file, s); mutex_unlock(&event_mutex); if (file) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; char *buf; int err = -ENODEV; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_data(filp); if (file) err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct trace_subsystem_dir *dir = NULL, *iter_dir; struct trace_array *tr = NULL, *iter_tr; struct event_subsystem *system = NULL; int ret; if (tracing_is_disabled()) return -ENODEV; /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { list_for_each_entry(iter_dir, &iter_tr->systems, list) { if (iter_dir == inode->i_private) { /* Don't open systems with no events */ tr = iter_tr; dir = iter_dir; if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; } goto exit_loop; } } } exit_loop: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); if (!system) return -ENODEV; /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); return -ENODEV; } ret = tracing_open_generic(inode, filp); if (ret < 0) { trace_array_put(tr); put_system(dir); } return ret; } static int system_tr_open(struct inode *inode, struct file *filp) { struct trace_subsystem_dir *dir; struct trace_array *tr = inode->i_private; int ret; /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); if (!dir) return -ENOMEM; ret = tracing_open_generic_tr(inode, filp); if (ret < 0) { kfree(dir); return ret; } dir->tr = tr; filp->private_data = dir; return 0; } static int subsystem_release(struct inode *inode, struct file *file) { struct trace_subsystem_dir *dir = file->private_data; trace_array_put(dir->tr); /* * If dir->subsystem is NULL, then this is a temporary * descriptor that was made for a trace_array to enable * all subsystems. */ if (dir->subsystem) put_system(dir); else kfree(dir); return 0; } static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; char *buf; int err; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static ssize_t show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); ring_buffer_print_page_header(tr->array_buffer.buffer, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); ring_buffer_print_entry_header(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; struct trace_pid_list *no_pid_list; /* * This function is called by on_each_cpu() while the * event_mutex is held. */ pid_list = rcu_dereference_protected(tr->filtered_pids, mutex_is_locked(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, mutex_is_locked(&event_mutex)); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, current)); } static void register_pid_events(struct trace_array *tr) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. * Register a probe this is called after all other probes * to only keep ignore_pid set if next pid matches. */ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, tr, INT_MAX); register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, tr, 0); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr, 0); } static ssize_t event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos, int type) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *other_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; ssize_t ret; if (!cnt) return 0; ret = tracing_update_buffers(tr); if (ret < 0) return ret; mutex_lock(&event_mutex); if (type == TRACE_PIDS) { filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); other_pids = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); } else { filtered_pids = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); other_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) goto out; if (type == TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, pid_list); else rcu_assign_pointer(tr->filtered_no_pids, pid_list); list_for_each_entry(file, &tr->events, list) { set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } if (filtered_pids) { tracepoint_synchronize_unregister(); trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { register_pid_events(tr); } /* * Ignoring of pids is done at task switch. But we have to * check for those tasks that are currently running. * Always do this in case a pid was appended or removed. */ on_each_cpu(ignore_task_cpu, tr, 1); out: mutex_unlock(&event_mutex); if (ret > 0) *ppos += ret; return ret; } static ssize_t ftrace_event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); } static ssize_t ftrace_event_npid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); } static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, .show = t_show, .stop = t_stop, }; static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, .show = t_show, .stop = t_stop, }; static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, .show = trace_pid_show, .stop = p_stop, }; static const struct seq_operations show_set_no_pid_seq_ops = { .start = np_start, .next = np_next, .show = trace_pid_show, .stop = p_stop, }; static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_set_open, .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_set_event_pid_fops = { .open = ftrace_event_set_pid_open, .read = seq_read, .write = ftrace_event_pid_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_set_event_notrace_pid_fops = { .open = ftrace_event_set_npid_open, .read = seq_read, .write = ftrace_event_npid_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { .open = tracing_open_file_tr, .read = event_enable_read, .write = event_enable_write, .release = tracing_release_file_tr, .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { .open = trace_format_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, .read = event_filter_read, .write = event_filter_write, .release = tracing_release_file_tr, .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_tr_enable_fops = { .open = system_tr_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_show_header_page_fops = { .open = tracing_open_generic_tr, .read = show_header_page_file, .llseek = default_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations ftrace_show_header_event_fops = { .open = tracing_open_generic_tr, .read = show_header_event_file, .llseek = default_llseek, .release = tracing_release_generic_tr, }; static int ftrace_event_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct seq_file *m; int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; ret = seq_open(file, seq_ops); if (ret < 0) return ret; m = file->private_data; /* copy tr over to seq ops */ m->private = inode->i_private; return ret; } static int ftrace_event_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return seq_release(inode, file); } static int ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; /* Checks for tracefs lockdown */ return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_events(tr); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static int ftrace_event_set_pid_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_pid_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_event_pids(tr, TRACE_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static int ftrace_event_set_npid_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_event_pids(tr, TRACE_NO_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static struct event_subsystem * create_new_subsystem(const char *name) { struct event_subsystem *system; /* need to create new entry */ system = kmalloc(sizeof(*system), GFP_KERNEL); if (!system) return NULL; system->ref_count = 1; /* Only allocate if dynamic (kprobes and modules) */ system->name = kstrdup_const(name, GFP_KERNEL); if (!system->name) goto out_free; system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); if (!system->filter) goto out_free; list_add(&system->list, &event_subsystems); return system; out_free: kfree_const(system->name); kfree(system); return NULL; } static int system_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "filter") == 0) *fops = &ftrace_subsystem_filter_fops; else if (strcmp(name, "enable") == 0) *fops = &ftrace_system_enable_fops; else return 0; *mode = TRACE_MODE_WRITE; return 1; } static struct eventfs_inode * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct eventfs_inode *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; struct eventfs_inode *ei; int nr_entries; static struct eventfs_entry system_entries[] = { { .name = "filter", .callback = system_callback, }, { .name = "enable", .callback = system_callback, } }; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; return dir->ei; } } /* Now see if the system itself exists. */ system = NULL; list_for_each_entry(iter, &event_subsystems, list) { if (strcmp(iter->name, name) == 0) { system = iter; break; } } dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) goto out_fail; if (!system) { system = create_new_subsystem(name); if (!system) goto out_free; } else __get_system(system); /* ftrace only has directories no files */ if (strcmp(name, "ftrace") == 0) nr_entries = 0; else nr_entries = ARRAY_SIZE(system_entries); ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); if (IS_ERR(ei)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } dir->ei = ei; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; dir->subsystem = system; file->system = dir; list_add(&dir->list, &tr->systems); return dir->ei; out_free: kfree(dir); out_fail: /* Only print this message if failed on memory allocation */ if (!dir || !system) pr_warn("No memory to create event subsystem %s\n", name); return NULL; } static int event_define_fields(struct trace_event_call *call) { struct list_head *head; int ret = 0; /* * Other events may have the same class. Only update * the fields if they are not already defined. */ head = trace_get_fields(call); if (list_empty(head)) { struct trace_event_fields *field = call->class->fields_array; unsigned int offset = sizeof(struct trace_entry); for (; field->type; field++) { if (field->type == TRACE_FUNCTION_TYPE) { field->define_fields(call); break; } offset = ALIGN(offset, field->align); ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, field->len); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; } offset += field->size; } } return ret; } static int event_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { struct trace_event_file *file = *data; struct trace_event_call *call = file->event_call; if (strcmp(name, "format") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_event_format_fops; *data = call; return 1; } /* * Only event directories that can be enabled should have * triggers or filters, with the exception of the "print" * event that can have a "trigger" file. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { if (call->class->reg && strcmp(name, "enable") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_enable_fops; return 1; } if (strcmp(name, "filter") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_event_filter_fops; return 1; } } if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || strcmp(trace_event_name(call), "print") == 0) { if (strcmp(name, "trigger") == 0) { *mode = TRACE_MODE_WRITE; *fops = &event_trigger_fops; return 1; } } #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg && strcmp(name, "id") == 0) { *mode = TRACE_MODE_READ; *data = (void *)(long)call->event.type; *fops = &ftrace_event_id_fops; return 1; } #endif #ifdef CONFIG_HIST_TRIGGERS if (strcmp(name, "hist") == 0) { *mode = TRACE_MODE_READ; *fops = &event_hist_fops; return 1; } #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG if (strcmp(name, "hist_debug") == 0) { *mode = TRACE_MODE_READ; *fops = &event_hist_debug_fops; return 1; } #endif #ifdef CONFIG_TRACE_EVENT_INJECT if (call->event.type && call->class->reg && strcmp(name, "inject") == 0) { *mode = 0200; *fops = &event_inject_fops; return 1; } #endif return 0; } static int event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; struct eventfs_inode *e_events; struct eventfs_inode *ei; const char *name; int nr_entries; int ret; static struct eventfs_entry event_entries[] = { { .name = "enable", .callback = event_callback, }, { .name = "filter", .callback = event_callback, }, { .name = "trigger", .callback = event_callback, }, { .name = "format", .callback = event_callback, }, #ifdef CONFIG_PERF_EVENTS { .name = "id", .callback = event_callback, }, #endif #ifdef CONFIG_HIST_TRIGGERS { .name = "hist", .callback = event_callback, }, #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG { .name = "hist_debug", .callback = event_callback, }, #endif #ifdef CONFIG_TRACE_EVENT_INJECT { .name = "inject", .callback = event_callback, }, #endif }; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". This should * never happen. */ if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) return -ENODEV; e_events = event_subsystem_dir(tr, call->class->system, file, parent); if (!e_events) return -ENOMEM; nr_entries = ARRAY_SIZE(event_entries); name = trace_event_name(call); ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); if (IS_ERR(ei)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } file->ei = ei; ret = event_define_fields(call); if (ret < 0) { pr_warn("Could not initialize trace point events/%s\n", name); return ret; } return 0; } static void remove_event_from_tracers(struct trace_event_call *call) { struct trace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { if (file->event_call != call) continue; remove_event_file_dir(file); /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); } static void event_remove(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; do_for_each_event_file(tr, file) { if (file->event_call != call) continue; if (file->flags & EVENT_FILE_FL_WAS_ENABLED) tr->clear_trace = true; ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); if (call->event.funcs) __unregister_trace_event(&call->event); remove_event_from_tracers(call); list_del(&call->list); } static int event_init(struct trace_event_call *call) { int ret = 0; const char *name; name = trace_event_name(call); if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) pr_warn("Could not initialize trace events/%s\n", name); } return ret; } static int __register_event(struct trace_event_call *call, struct module *mod) { int ret; ret = event_init(call); if (ret < 0) return ret; list_add(&call->list, &ftrace_events); if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else call->module = mod; return 0; } static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; /* Find the length of the eval value as a string */ elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) return NULL; snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); memmove(ptr + elen, ptr + len, rlen); /* Make sure we end the new string */ ptr[elen + rlen] = 0; return ptr + elen; } static void update_event_printk(struct trace_event_call *call, struct trace_eval_map *map) { char *ptr; int quote = 0; int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { ptr++; /* paranoid */ if (!*ptr) break; continue; } if (*ptr == '"') { quote ^= 1; continue; } if (quote) continue; if (isdigit(*ptr)) { /* skip numbers */ do { ptr++; /* Check for alpha chars like ULL */ } while (isalnum(*ptr)); if (!*ptr) break; /* * A number must have some kind of delimiter after * it, and we can ignore that too. */ continue; } if (isalpha(*ptr) || *ptr == '_') { if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { ptr = eval_replace(ptr, map, len); /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* * No need to decrement here, as eval_replace() * returns the pointer to the character passed * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ continue; } skip_more: do { ptr++; } while (isalnum(*ptr) || *ptr == '_'); if (!*ptr) break; /* * If what comes after this variable is a '.' or * '->' then we can continue to ignore that string. */ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { ptr += *ptr == '.' ? 1 : 2; if (!*ptr) break; goto skip_more; } /* * Once again, we can skip the delimiter that came * after the string. */ continue; } } } static void add_str_to_module(struct module *module, char *str) { struct module_string *modstr; modstr = kmalloc(sizeof(*modstr), GFP_KERNEL); /* * If we failed to allocate memory here, then we'll just * let the str memory leak when the module is removed. * If this fails to allocate, there's worse problems than * a leaked string on module removal. */ if (WARN_ON_ONCE(!modstr)) return; modstr->module = module; modstr->str = str; list_add(&modstr->next, &module_strings); } static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; struct list_head *head; char *ptr; char *str; int len = strlen(map->eval_string); /* Dynamic events should never have field maps */ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) return; head = trace_get_fields(call); list_for_each_entry(field, head, link) { ptr = strchr(field->type, '['); if (!ptr) continue; ptr++; if (!isalpha(*ptr) && *ptr != '_') continue; if (strncmp(map->eval_string, ptr, len) != 0) continue; str = kstrdup(field->type, GFP_KERNEL); if (WARN_ON_ONCE(!str)) return; ptr = str + (ptr - field->type); ptr = eval_replace(ptr, map, len); /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) { kfree(str); continue; } /* * If the event is part of a module, then we need to free the string * when the module is removed. Otherwise, it will stay allocated * until a reboot. */ if (call->module) add_str_to_module(call->module, str); field->type = str; } } void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; int last_i; int i; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { first = true; last_i = 0; last_system = call->class->system; } /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call * was from the same system. That's what last_i is for. If the * call has the same system as the previous call, then last_i * will be the index of the first map[] that has a matching * system. */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ if (first) { last_i = i; first = false; } update_event_printk(call, map[i]); update_event_fields(call, map[i]); } } cond_resched(); } up_write(&trace_event_sem); } static bool event_in_systems(struct trace_event_call *call, const char *systems) { const char *system; const char *p; if (!systems) return true; system = call->class->system; p = strstr(systems, system); if (!p) return false; if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') return false; p += strlen(system); return !*p || isspace(*p) || *p == ','; } static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; struct trace_event_file *file; unsigned int first; if (!event_in_systems(call, tr->system_names)) return NULL; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); if (!trace_pid_list_first(pid_list, &first) || !trace_pid_list_first(no_pid_list, &first)) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); event_file_get(file); return file; } #define MAX_BOOT_TRIGGERS 32 static struct boot_triggers { const char *event; char *trigger; } bootup_triggers[MAX_BOOT_TRIGGERS]; static char bootup_trigger_buf[COMMAND_LINE_SIZE]; static int nr_boot_triggers; static __init int setup_trace_triggers(char *str) { char *trigger; char *buf; int i; strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); buf = bootup_trigger_buf; for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { trigger = strsep(&buf, ","); if (!trigger) break; bootup_triggers[i].event = strsep(&trigger, "."); bootup_triggers[i].trigger = trigger; if (!bootup_triggers[i].trigger) break; } nr_boot_triggers = i; return 1; } __setup("trace_trigger=", setup_trace_triggers); /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; file = trace_create_new_event(call, tr); /* * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed * allocation, or NULL if the event is not part of the tr->system_names. * When the event is not part of the tr->system_names, return zero, not * an error. */ if (!file) return 0; if (IS_ERR(file)) return PTR_ERR(file); if (eventdir_initialized) return event_create_dir(tr->event_dir, file); else return event_define_fields(call); } static void trace_early_triggers(struct trace_event_file *file, const char *name) { int ret; int i; for (i = 0; i < nr_boot_triggers; i++) { if (strcmp(name, bootup_triggers[i].event)) continue; mutex_lock(&event_mutex); ret = trigger_process_regex(file, bootup_triggers[i].trigger); mutex_unlock(&event_mutex); if (ret) pr_err("Failed to register trigger '%s' on event %s\n", bootup_triggers[i].trigger, bootup_triggers[i].event); } } /* * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ static int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; int ret; file = trace_create_new_event(call, tr); /* * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed * allocation, or NULL if the event is not part of the tr->system_names. * When the event is not part of the tr->system_names, return zero, not * an error. */ if (!file) return 0; if (IS_ERR(file)) return PTR_ERR(file); ret = event_define_fields(call); if (ret) return ret; trace_early_triggers(file, trace_event_name(call)); return 0; } struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); /* Add an additional event_call dynamically */ int trace_add_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); mutex_lock(&trace_types_lock); ret = __register_event(call, NULL); if (ret >= 0) __add_event_to_tracers(call); mutex_unlock(&trace_types_lock); return ret; } EXPORT_SYMBOL_GPL(trace_add_event_call); /* * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. */ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); free_event_filter(call->filter); call->filter = NULL; } static int probe_remove_event_call(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; #ifdef CONFIG_PERF_EVENTS if (call->perf_refcount) return -EBUSY; #endif do_for_each_event_file(tr, file) { if (file->event_call != call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) goto busy; if (file->flags & EVENT_FILE_FL_WAS_ENABLED) tr->clear_trace = true; /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); __trace_remove_event_call(call); return 0; busy: /* No need to clear the trace now */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { tr->clear_trace = false; } return -EBUSY; } /* Remove an event_call */ int trace_remove_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&trace_types_lock); return ret; } EXPORT_SYMBOL_GPL(trace_remove_event_call); #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ event++) #ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { struct trace_event_call **call, **start, **end; if (!mod->num_trace_events) return; /* Don't add infrastructure for mods without tracepoints */ if (trace_module_has_bad_taint(mod)) { pr_err("%s: module has bad taint, not creating trace events\n", mod->name); return; } start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; for_each_event(call, start, end) { __register_event(*call, mod); __add_event_to_tracers(*call); } } static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; struct module_string *modstr, *m; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module) continue; if (call->module == mod) __trace_remove_event_call(call); } /* Check for any strings allocade for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; list_del(&modstr->next); kfree(modstr->str); kfree(modstr); } up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded * registered any events that were used. The only worry is if * a new module gets loaded, and takes on the same id as the events * of this module. When printing out the buffer, traced events left * over from this module may be passed to the new module events and * unexpected results may occur. */ tracing_reset_all_online_cpus_unlocked(); } static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); switch (val) { case MODULE_STATE_COMING: trace_module_add_events(mod); break; case MODULE_STATE_GOING: trace_module_remove_events(mod); break; } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return NOTIFY_OK; } static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 1, /* higher than trace.c module notify */ }; #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ static void __trace_add_event_dirs(struct trace_array *tr) { struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(call)); } } /* Returns any file that matches the system and event */ struct trace_event_file * __find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; struct trace_event_call *call; const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; name = trace_event_name(call); if (!name || !call->class) continue; if (strcmp(event, name) == 0 && strcmp(system, call->class->system) == 0) return file; } return NULL; } /* Returns valid trace event files that match system and event */ struct trace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; file = __find_event_file(tr, system, event); if (!file || !file->event_call->class->reg || file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) return NULL; return file; } /** * trace_get_event_file - Find and return a trace event file * @instance: The name of the trace instance containing the event * @system: The name of the system containing the event * @event: The name of the event * * Return a trace event file given the trace instance name, trace * system, and trace event name. If the instance name is NULL, it * refers to the top-level trace array. * * This function will look it up and return it if found, after calling * trace_array_get() to prevent the instance from going away, and * increment the event's module refcount to prevent it from being * removed. * * To release the file, call trace_put_event_file(), which will call * trace_array_put() and decrement the event's module refcount. * * Return: The trace event on success, ERR_PTR otherwise. */ struct trace_event_file *trace_get_event_file(const char *instance, const char *system, const char *event) { struct trace_array *tr = top_trace_array(); struct trace_event_file *file = NULL; int ret = -EINVAL; if (instance) { tr = trace_array_find_get(instance); if (!tr) return ERR_PTR(-ENOENT); } else { ret = trace_array_get(tr); if (ret) return ERR_PTR(ret); } mutex_lock(&event_mutex); file = find_event_file(tr, system, event); if (!file) { trace_array_put(tr); ret = -EINVAL; goto out; } /* Don't let event modules unload while in use */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); ret = -EBUSY; goto out; } ret = 0; out: mutex_unlock(&event_mutex); if (ret) file = ERR_PTR(ret); return file; } EXPORT_SYMBOL_GPL(trace_get_event_file); /** * trace_put_event_file - Release a file from trace_get_event_file() * @file: The trace event file * * If a file was retrieved using trace_get_event_file(), this should * be called when it's no longer needed. It will cancel the previous * trace_array_get() called by that function, and decrement the * event's module refcount. */ void trace_put_event_file(struct trace_event_file *file) { mutex_lock(&event_mutex); trace_event_put_ref(file->event_call); mutex_unlock(&event_mutex); trace_array_put(file->tr); } EXPORT_SYMBOL_GPL(trace_put_event_file); #ifdef CONFIG_DYNAMIC_FTRACE /* Avoid typos */ #define ENABLE_EVENT_STR "enable_event" #define DISABLE_EVENT_STR "disable_event" struct event_probe_data { struct trace_event_file *file; unsigned long count; int ref; bool enable; }; static void update_event_probe(struct event_probe_data *data) { if (data->enable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); } static void event_enable_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; update_event_probe(edata); } static void event_enable_count_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (edata->count != -1) (edata->count)--; update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (WARN_ON_ONCE(!pdata || !*pdata)) return 0; edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, edata->file->event_call->class->system, trace_event_name(edata->file->event_call)); if (edata->count == -1) seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data) { struct ftrace_func_mapper *mapper = *data; struct event_probe_data *edata = init_data; int ret; if (!mapper) { mapper = allocate_ftrace_func_mapper(); if (!mapper) return -ENODEV; *data = mapper; } ret = ftrace_func_mapper_add_ip(mapper, ip, edata); if (ret < 0) return ret; edata->ref++; return 0; } static int free_probe_data(void *data) { struct event_probe_data *edata = data; edata->ref--; if (!edata->ref) { /* Remove the SOFT_MODE flag */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); } return 0; } static void event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; if (!ip) { if (!mapper) return; free_ftrace_func_mapper(mapper, free_probe_data); return; } edata = ftrace_func_mapper_remove_ip(mapper, ip); if (WARN_ON_ONCE(!edata)) return; if (WARN_ON_ONCE(edata->ref <= 0)) return; free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_enable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static int event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; const char *system; const char *event; char *number; bool enable; int ret; if (!tr) return -ENODEV; /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; system = strsep(&param, ":"); if (!param) return -EINVAL; event = strsep(&param, ":"); mutex_lock(&event_mutex); ret = -EINVAL; file = find_event_file(tr, system, event); if (!file) goto out; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; if (enable) ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; else ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); goto out; } ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; data->enable = enable; data->count = -1; data->file = file; if (!param) goto out_reg; number = strsep(&param, ":"); ret = -EINVAL; if (!strlen(number)) goto out_free; /* * We use the callback data field (which is a pointer) * as our counter. */ ret = kstrtoul(number, 0, &data->count); if (ret) goto out_free; out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { ret = -EBUSY; goto out_free; } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ if (!ret) { ret = -ENOENT; goto out_disable; } else if (ret < 0) goto out_disable; /* Just return zero, not the number of enabled functions */ ret = 0; out: mutex_unlock(&event_mutex); return ret; out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); out_free: kfree(data); goto out; } static struct ftrace_func_command event_enable_cmd = { .name = ENABLE_EVENT_STR, .func = event_enable_func, }; static struct ftrace_func_command event_disable_cmd = { .name = DISABLE_EVENT_STR, .func = event_enable_func, }; static __init int register_event_cmds(void) { int ret; ret = register_ftrace_command(&event_enable_cmd); if (WARN_ON(ret < 0)) return ret; ret = register_ftrace_command(&event_disable_cmd); if (WARN_ON(ret < 0)) unregister_ftrace_command(&event_enable_cmd); return ret; } #else static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* * The top level array and trace arrays created by boot-time tracing * have already had its trace_event_file descriptors created in order * to allow for early events to be recorded. * This function is called after the tracefs has been initialized, * and we now have to create the files associated to the events. */ static void __trace_early_add_event_dirs(struct trace_array *tr) { struct trace_event_file *file; int ret; list_for_each_entry(file, &tr->events, list) { ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(file->event_call)); } } /* * For early boot up, the top trace array and the trace arrays created * by boot-time tracing require to have a list of events that can be * enabled. This must be done before the filesystem is set up in order * to allow events to be traced early. */ void __trace_early_add_events(struct trace_array *tr) { struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { /* Early boot up should not have any modules loaded */ if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) && WARN_ON_ONCE(call->module)) continue; ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create early event %s\n", trace_event_name(call)); } } /* Remove the event directory structure for a trace directory. */ static void __trace_remove_event_dirs(struct trace_array *tr) { struct trace_event_file *file, *next; list_for_each_entry_safe(file, next, &tr->events, list) remove_event_file_dir(file); } static void __add_event_to_tracers(struct trace_event_call *call) { struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) __trace_add_new_event(call, tr); } extern struct trace_event_call *__start_ftrace_events[]; extern struct trace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); return 1; } __setup("trace_event=", setup_trace_event); static int events_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "enable") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_tr_enable_fops; return 1; } if (strcmp(name, "header_page") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_show_header_page_fops; } else if (strcmp(name, "header_event") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_show_header_event_fops; } else return 0; return 1; } /* Expects to have event_mutex held when called */ static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct eventfs_inode *e_events; struct dentry *entry; int nr_entries; static struct eventfs_entry events_entries[] = { { .name = "enable", .callback = events_callback, }, { .name = "header_page", .callback = events_callback, }, { .name = "header_event", .callback = events_callback, }, }; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; nr_entries = ARRAY_SIZE(events_entries); e_events = eventfs_create_events_dir("events", parent, events_entries, nr_entries, tr); if (IS_ERR(e_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } /* There are not as crucial, just warn if they are not created */ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_pid_fops); trace_create_file("set_event_notrace_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_notrace_pid_fops); tr->event_dir = e_events; return 0; } /** * event_trace_add_tracer - add a instance of a trace_array to events * @parent: The parent dentry to place the files/directories for events in * @tr: The trace array associated with these events * * When a new instance is created, it needs to set up its events * directory, as well as other files associated with events. It also * creates the event hierarchy in the @parent/events directory. * * Returns 0 on success. * * Must be called with event_mutex held. */ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; lockdep_assert_held(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) goto out; down_write(&trace_event_sem); /* If tr already has the event list, it is initialized in early boot. */ if (unlikely(!list_empty(&tr->events))) __trace_early_add_event_dirs(tr); else __trace_add_event_dirs(tr); up_write(&trace_event_sem); out: return ret; } /* * The top trace array already had its file descriptors created. * Now the files themselves need to be created. */ static __init int early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; mutex_lock(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) goto out_unlock; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); out_unlock: mutex_unlock(&event_mutex); return ret; } /* Must be called with event_mutex held */ int event_trace_del_tracer(struct trace_array *tr) { lockdep_assert_held(&event_mutex); /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); /* Clear the pid list */ __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); down_write(&trace_event_sem); __trace_remove_event_dirs(tr); eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; return 0; } static __init int event_trace_memsetup(void) { field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC); return 0; } __init void early_enable_events(struct trace_array *tr, char *buf, bool disable_first) { char *token; int ret; while (true) { token = strsep(&buf, ","); if (!token) break; if (*token) { /* Restarting syscalls requires that we stop them first */ if (disable_first) ftrace_set_clr_event(tr, token, 0); ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } /* Put back the comma to allow this to be called again */ if (buf) *(buf - 1) = ','; } } static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); struct trace_event_call **iter, *call; int ret; if (!tr) return -ENODEV; for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; ret = event_init(call); if (!ret) list_add(&call->list, &ftrace_events); } register_trigger_cmds(); /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories * are created. Create the file entries now, and attach them * to the actual file dentries later. */ __trace_early_add_events(tr); early_enable_events(tr, bootup_event_buf, false); trace_printk_start_comm(); register_event_cmds(); return 0; } /* * event_trace_enable() is called from trace_event_init() first to * initialize events and perhaps start any events that are on the * command line. Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But * event_trace_enable() is called before pid 1 starts, and this flag * is never set, making the syscall tracepoint never get reached, but * the event is enabled regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { struct trace_array *tr; tr = top_trace_array(); if (!tr) return -ENODEV; early_enable_events(tr, bootup_event_buf, true); return 0; } early_initcall(event_trace_enable_again); /* Init fields which doesn't related to the tracefs */ static __init int event_trace_init_fields(void) { if (trace_define_generic_fields()) pr_warn("tracing: Failed to allocated generic fields"); if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); return 0; } __init int event_trace_init(void) { struct trace_array *tr; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; trace_create_file("available_events", TRACE_MODE_READ, NULL, tr, &ftrace_avail_fops); ret = early_event_add_tracer(NULL, tr); if (ret) return ret; #ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) pr_warn("Failed to register trace events module notifier\n"); #endif eventdir_initialized = true; return 0; } void __init trace_event_init(void) { event_trace_memsetup(); init_ftrace_syscalls(); event_trace_enable(); event_trace_init_fields(); } #ifdef CONFIG_EVENT_TRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); static DEFINE_SPINLOCK(test_spinlock_irq); static DEFINE_MUTEX(test_mutex); static __init void test_work(struct work_struct *dummy) { spin_lock(&test_spinlock); spin_lock_irq(&test_spinlock_irq); udelay(1); spin_unlock_irq(&test_spinlock_irq); spin_unlock(&test_spinlock); mutex_lock(&test_mutex); msleep(1); mutex_unlock(&test_mutex); } static __init int event_test_thread(void *unused) { void *test_malloc; test_malloc = kmalloc(1234, GFP_KERNEL); if (!test_malloc) pr_info("failed to kmalloc\n"); schedule_on_each_cpu(test_work); kfree(test_malloc); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; } /* * Do various things that may trigger events. */ static __init void event_test_stuff(void) { struct task_struct *test_thread; test_thread = kthread_run(event_test_thread, NULL, "test-events"); msleep(1); kthread_stop(test_thread); } /* * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. */ static __init void event_trace_self_tests(void) { struct trace_subsystem_dir *dir; struct trace_event_file *file; struct trace_event_call *call; struct event_subsystem *system; struct trace_array *tr; int ret; tr = top_trace_array(); if (!tr) return; pr_info("Running tests on trace events:\n"); list_for_each_entry(file, &tr->events, list) { call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) continue; /* * Testing syscall events here is pretty useless, but * we still do it if configured. But this is time consuming. * What we really need is a user thread to perform the * syscalls as we test. */ #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS if (call->class->system && strcmp(call->class->system, "syscalls") == 0) continue; #endif pr_info("Testing event %s: ", trace_event_name(call)); /* * If an event is already enabled, someone is using * it and the self test should not be on. */ if (file->flags & EVENT_FILE_FL_ENABLED) { pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } ftrace_event_enable_disable(file, 1); event_test_stuff(); ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } /* Now test at the sub system level */ pr_info("Running tests on trace event systems:\n"); list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) continue; pr_info("Testing event system %s: ", system->name); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); continue; } event_test_stuff(); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", system->name); continue; } pr_cont("OK\n"); } /* Test with all events enabled */ pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; } event_test_stuff(); /* reset sysname */ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; } pr_cont("OK\n"); } #ifdef CONFIG_FUNCTION_TRACER static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; unsigned int trace_ctx; long disabled; int cpu; trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, TRACE_FN, sizeof(*entry), trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; event_trigger_unlock_commit(&event_trace_file, buffer, event, entry, trace_ctx); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, }; static __init void event_trace_self_test_with_function(void) { int ret; event_trace_file.tr = top_trace_array(); if (WARN_ON(!event_trace_file.tr)) return; ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); return; } pr_info("Running tests again, along with the function tracer\n"); event_trace_self_tests(); unregister_ftrace_function(&trace_ops); } #else static __init void event_trace_self_test_with_function(void) { } #endif static __init int event_trace_self_tests_init(void) { if (!tracing_selftest_disabled) { event_trace_self_tests(); event_trace_self_test_with_function(); } return 0; } late_initcall(event_trace_self_tests_init); #endif
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 9 4 4 4 4 5 9 5 4 4 4 1 3 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 // SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * * Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation * * ChangeLog: * * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> * Changed the compression method from stem compression to "table lookup" * compression (see scripts/kallsyms.c for a more complete description) */ #include <linux/kallsyms.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/kdb.h> #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/ctype.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/kprobes.h> #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/bsearch.h> #include <linux/btf_ids.h> #include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result, size_t maxlen) { int len, skipped_first = 0; const char *tptr; const u8 *data; /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; len = *data; data++; off++; /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ if ((len & 0x80) != 0) { len = (len & 0x7F) | (*data << 7); data++; off++; } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ off += len; /* * For every byte on the compressed symbol data, copy the table * entry for that byte. */ while (len) { tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; data++; len--; while (*tptr) { if (skipped_first) { if (maxlen <= 1) goto tail; *result = *tptr; result++; maxlen--; } else skipped_first = 1; tptr++; } } tail: if (maxlen) *result = '\0'; /* Return to offset to the next symbol. */ return off; } /* * Get symbol type information. This is encoded as a single char at the * beginning of the symbol name. */ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, * and return the first char from this token. */ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } /* * Find the offset on the compressed stream given and index in the * kallsyms array. */ static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; int i, len; /* * Use the closest marker we have. We have markers every 256 positions, * so that should be close enough. */ name = &kallsyms_names[kallsyms_markers[pos >> 8]]; /* * Sequentially scan all the symbols up to the point we're searching * for. Every symbol is stored in a [<len>][<len> bytes of data] format, * so we just need to add the len to the current pointer for every * symbol we wish to skip. */ for (i = 0; i < (pos & 0xFF); i++) { len = *name; /* * If MSB is 1, it is a "big" symbol, so we need to look into * the next byte (and skip it, too). */ if ((len & 0x80) != 0) len = ((len & 0x7F) | (name[1] << 7)) + 1; name = name + len + 1; } return name - kallsyms_names; } unsigned long kallsyms_sym_address(int idx) { if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) return kallsyms_addresses[idx]; /* values are unsigned offsets if --absolute-percpu is not in effect */ if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; /* ...otherwise, positive offsets are absolute values */ if (kallsyms_offsets[idx] >= 0) return kallsyms_offsets[idx]; /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } static void cleanup_symbol_name(char *s) { char *res; if (!IS_ENABLED(CONFIG_LTO_CLANG)) return; /* * LLVM appends various suffixes for local functions and variables that * must be promoted to global scope as part of LTO. This can break * hooking of static functions with kprobes. '.' is not a valid * character in an identifier in C. Suffixes only in LLVM LTO observed: * - foo.llvm.[0-9a-f]+ */ res = strstr(s, ".llvm."); if (res) *res = '\0'; return; } static int compare_symbol_name(const char *name, char *namebuf) { /* The kallsyms_seqs_of_names is sorted based on names after * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. * To ensure correct bisection in kallsyms_lookup_names(), do * cleanup_symbol_name(namebuf) before comparing name and namebuf. */ cleanup_symbol_name(namebuf); return strcmp(name, namebuf); } static unsigned int get_symbol_seq(int index) { unsigned int i, seq = 0; for (i = 0; i < 3; i++) seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i]; return seq; } static int kallsyms_lookup_names(const char *name, unsigned int *start, unsigned int *end) { int ret; int low, mid, high; unsigned int seq, off; char namebuf[KSYM_NAME_LEN]; low = 0; high = kallsyms_num_syms - 1; while (low <= high) { mid = low + (high - low) / 2; seq = get_symbol_seq(mid); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = compare_symbol_name(name, namebuf); if (ret > 0) low = mid + 1; else if (ret < 0) high = mid - 1; else break; } if (low > high) return -ESRCH; low = mid; while (low) { seq = get_symbol_seq(low - 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (compare_symbol_name(name, namebuf)) break; low--; } *start = low; if (end) { high = mid; while (high < kallsyms_num_syms - 1) { seq = get_symbol_seq(high + 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (compare_symbol_name(name, namebuf)) break; high++; } *end = high; } return 0; } /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { int ret; unsigned int i; /* Skip the search for empty string. */ if (!*name) return 0; ret = kallsyms_lookup_names(name, &i, NULL); if (!ret) return kallsyms_sym_address(get_symbol_seq(i)); return module_kallsyms_lookup_name(name); } /* * Iterate over all symbols in vmlinux. For symbols from modules use * module_kallsyms_on_each_symbol instead. */ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { char namebuf[KSYM_NAME_LEN]; unsigned long i; unsigned int off; int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, kallsyms_sym_address(i)); if (ret != 0) return ret; cond_resched(); } return 0; } int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { int ret; unsigned int i, start, end; ret = kallsyms_lookup_names(name, &start, &end); if (ret) return 0; for (i = start; !ret && i <= end; i++) { ret = fn(data, kallsyms_sym_address(get_symbol_seq(i))); cond_resched(); } return ret; } static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; /* This kernel should never had been booted. */ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) BUG_ON(!kallsyms_addresses); else BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; high = kallsyms_num_syms; while (high - low > 1) { mid = low + (high - low) / 2; if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; } /* * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { if (kallsyms_sym_address(i) > symbol_start) { symbol_end = kallsyms_sym_address(i); break; } } /* If we found no next symbol, we use the end of the section. */ if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; } if (symbolsize) *symbolsize = symbol_end - symbol_start; if (offset) *offset = addr - symbol_start; return low; } /* * Lookup an address but don't bother to find any names. */ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); return 1; } return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } static const char *kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { const char *ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; if (modbuildid) *modbuildid = NULL; ret = namebuf; goto found; } /* See if it's in a module or a BPF JITed image. */ ret = module_address_lookup(addr, symbolsize, offset, modname, modbuildid, namebuf); if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); if (!ret) ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); found: cleanup_symbol_name(namebuf); return ret; } /* * Lookup an address * - modname is set to NULL if it's in the kernel. * - We guarantee that the returned name is valid until we reschedule even if. * It resides in a module. * - We also guarantee that modname will be valid until rescheduled. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return kallsyms_lookup_buildid(addr, symbolsize, offset, modname, NULL, namebuf); } int lookup_symbol_name(unsigned long addr, char *symname) { int res; symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); goto found; } /* See if it's in a module. */ res = lookup_module_symbol_name(addr, symname); if (res) return res; found: cleanup_symbol_name(symname); return 0; } /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) { char *modname; const unsigned char *buildid; const char *name; unsigned long offset, size; int len; address += symbol_offset; name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); if (!name) return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); len = strlen(buffer); offset -= symbol_offset; if (add_offset) len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); if (modname) { len += sprintf(buffer + len, " [%s", modname); #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) if (add_buildid && buildid) { /* build ID should match length of sprintf */ #if IS_ENABLED(CONFIG_MODULES) static_assert(sizeof(typeof_member(struct module, build_id)) == 20); #endif len += sprintf(buffer + len, " %20phN", buildid); } #endif len += sprintf(buffer + len, "]"); } return len; } /** * sprint_symbol - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name, * offset, size and module name to @buffer if possible. If no symbol was found, * just saves its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 1, 0); } EXPORT_SYMBOL_GPL(sprint_symbol); /** * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name, * offset, size, module name and module build ID to @buffer if possible. If no * symbol was found, just saves its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol_build_id(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 1, 1); } EXPORT_SYMBOL_GPL(sprint_symbol_build_id); /** * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name * and module name to @buffer if possible. If no symbol was found, just saves * its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol_no_offset(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 0, 0); } EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); /** * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function is for stack backtrace and does the same thing as * sprint_symbol() but with modified/decreased @address. If there is a * tail-call to the function marked "noreturn", gcc optimized out code after * the call so that the stack-saved return address could point outside of the * caller. This function ensures that kallsyms will find the original caller * by decreasing @address. * * This function returns the number of bytes stored in @buffer. */ int sprint_backtrace(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, -1, 1, 0); } /** * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function is for stack backtrace and does the same thing as * sprint_symbol() but with modified/decreased @address. If there is a * tail-call to the function marked "noreturn", gcc optimized out code after * the call so that the stack-saved return address could point outside of the * caller. This function ensures that kallsyms will find the original caller * by decreasing @address. This function also appends the module build ID to * the @buffer if @address is within a kernel module. * * This function returns the number of bytes stored in @buffer. */ int sprint_backtrace_build_id(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, -1, 1, 1); } /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; char name[KSYM_NAME_LEN]; char module_name[MODULE_NAME_LEN]; int exported; int show_value; }; static int get_ksymbol_mod(struct kallsym_iter *iter) { int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_mod_end = iter->pos; return 0; } return 1; } /* * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace * purposes. In that case "__builtin__ftrace" is used as a module name, even * though "__builtin__ftrace" is not a module. */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_ftrace_mod_end = iter->pos; return 0; } return 1; } static int get_ksymbol_bpf(struct kallsym_iter *iter) { int ret; strscpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, iter->name); if (ret < 0) { iter->pos_bpf_end = iter->pos; return 0; } return 1; } /* * This uses "__builtin__kprobes" as a module name for symbols for pages * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a * module. */ static int get_ksymbol_kprobe(struct kallsym_iter *iter) { strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); iter->exported = 0; return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, &iter->value, &iter->type, iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; iter->module_name[0] = '\0'; iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) { iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; iter->pos_bpf_end = 0; } } /* * The end position (last + 1) of each additional kallsyms section is recorded * in iter->pos_..._end as each section is added, and so can be used to * determine which get_ksymbol_...() function to call next. */ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && get_ksymbol_mod(iter)) return 1; if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) && get_ksymbol_ftrace_mod(iter)) return 1; if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && get_ksymbol_bpf(iter)) return 1; return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. */ if (pos >= kallsyms_num_syms) return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) reset_iter(iter, pos); iter->nameoff += get_ksymbol_core(iter); iter->pos++; return 1; } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { (*pos)++; if (!update_iter(m->private, *pos)) return NULL; return p; } static void *s_start(struct seq_file *m, loff_t *pos) { if (!update_iter(m->private, *pos)) return NULL; return m->private; } static void s_stop(struct seq_file *m, void *p) { } static int s_show(struct seq_file *m, void *p) { void *value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; value = iter->show_value ? (void *)iter->value : NULL; if (iter->module_name[0]) { char type; /* * Label it "global" if it is exported, * "local" if not exported. */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); return 0; } static const struct seq_operations kallsyms_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show }; #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__ksym { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct kallsym_iter *, ksym); }; static int ksym_prog_seq_show(struct seq_file *m, bool in_stop) { struct bpf_iter__ksym ctx; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = m; prog = bpf_iter_get_info(&meta, in_stop); if (!prog) return 0; ctx.meta = &meta; ctx.ksym = m ? m->private : NULL; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p) { return ksym_prog_seq_show(m, false); } static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p) { if (!p) (void) ksym_prog_seq_show(m, true); else s_stop(m, p); } static const struct seq_operations bpf_iter_ksym_ops = { .start = s_start, .next = s_next, .stop = bpf_iter_ksym_seq_stop, .show = bpf_iter_ksym_seq_show, }; static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux) { struct kallsym_iter *iter = priv_data; reset_iter(iter, 0); /* cache here as in kallsyms_open() case; use current process * credentials to tell BPF iterators if values should be shown. */ iter->show_value = kallsyms_show_value(current_cred()); return 0; } DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym) static const struct bpf_iter_seq_info ksym_iter_seq_info = { .seq_ops = &bpf_iter_ksym_ops, .init_seq_private = bpf_iter_ksym_init, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct kallsym_iter), }; static struct bpf_iter_reg ksym_iter_reg_info = { .target = "ksym", .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ksym, ksym), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ksym_iter_seq_info, }; BTF_ID_LIST(btf_ksym_iter_id) BTF_ID(struct, kallsym_iter) static int __init bpf_ksym_iter_register(void) { ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id; return bpf_iter_reg_target(&ksym_iter_reg_info); } late_initcall(bpf_ksym_iter_register); #endif /* CONFIG_BPF_SYSCALL */ static int kallsyms_open(struct inode *inode, struct file *file) { /* * We keep iterator in m->private, since normal case is to * s_start from where we left off, so we avoid doing * using get_symbol_offset for every symbol. */ struct kallsym_iter *iter; iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); if (!iter) return -ENOMEM; reset_iter(iter, 0); /* * Instead of checking this on every s_show() call, cache * the result here at open time. */ iter->show_value = kallsyms_show_value(file->f_cred); return 0; } #ifdef CONFIG_KGDB_KDB const char *kdb_walk_kallsyms(loff_t *pos) { static struct kallsym_iter kdb_walk_kallsyms_iter; if (*pos == 0) { memset(&kdb_walk_kallsyms_iter, 0, sizeof(kdb_walk_kallsyms_iter)); reset_iter(&kdb_walk_kallsyms_iter, 0); } while (1) { if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) return NULL; ++*pos; /* Some debugging symbols have no name. Ignore them. */ if (kdb_walk_kallsyms_iter.name[0]) return kdb_walk_kallsyms_iter.name; } } #endif /* CONFIG_KGDB_KDB */ static const struct proc_ops kallsyms_proc_ops = { .proc_open = kallsyms_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, }; static int __init kallsyms_init(void) { proc_create("kallsyms", 0444, NULL, &kallsyms_proc_ops); return 0; } device_initcall(kallsyms_init);
91 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ #include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> #include <linux/string.h> #include <linux/types.h> #include <asm/mce.h> #ifdef CONFIG_X86_MCE static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) { static_branch_inc(&copy_mc_fragile_key); } #define copy_mc_fragile_enabled (static_branch_unlikely(&copy_mc_fragile_key)) /* * Similar to copy_user_handle_tail, probe for the write fault point, or * source exception point. */ __visible notrace unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) { for (; len; --len, to++, from++) if (copy_mc_fragile(to, from, 1)) break; return len; } #else /* * No point in doing careful copying, or consulting a static key when * there is no #MC handler in the CONFIG_X86_MCE=n case. */ void enable_copy_mc_fragile(void) { } #define copy_mc_fragile_enabled (0) #endif unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); /** * copy_mc_to_kernel - memory copy that handles source exceptions * * @dst: destination address * @src: source address * @len: number of bytes to copy * * Call into the 'fragile' version on systems that benefit from avoiding * corner case poison consumption scenarios, For example, accessing * poison across 2 cachelines with a single instruction. Almost all * other uses case can use copy_mc_enhanced_fast_string() for a fast * recoverable copy, or fallback to plain memcpy. * * Return 0 for success, or number of bytes not copied if there was an * exception. */ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { if (copy_mc_fragile_enabled) return copy_mc_fragile(dst, src, len); if (static_cpu_has(X86_FEATURE_ERMS)) return copy_mc_enhanced_fast_string(dst, src, len); memcpy(dst, src, len); return 0; } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } return copy_user_generic((__force void *)dst, src, len); }
683 683 17 665 1 1 660 2 2 663 552 28 27 28 697 14 1 688 721 720 10 10 211 4 209 210 698 91 318 726 586 727 727 28 557 727 24 726 10 610 623 66 690 690 149 2 686 684 6 683 690 688 685 683 680 622 71 11 683 682 637 679 678 678 8 8 363 410 85 329 330 410 374 38 89 76 76 76 17 87 88 16 26 26 50 58 39 32 257 257 2 17 17 17 17 17 17 17 16 16 286 71 71 110 302 17 17 22 83 66 6 1 106 106 34 9 331 73 299 41 8 47 30 31 2 105 102 97 13 65 56 667 58 57 58 53 32 26 41 26 17 29 40 26 41 17 57 26 350 350 23 333 32 285 93 17 56 26 5 4 4 4 1 2 2 2 119 116 5 3 5 2 7 7 107 7 109 106 64 50 54 2761 2763 2662 684 690 50 74 1 684 71 684 682 1 677 678 14 677 53 178 8 53 178 2 1 687 689 690 687 2 2 71 580 580 579 7 46 2 46 46 621 61 4 68 69 34 63 39 33 6 44 6 38 37 38 16 23 38 3 108 119 8 111 111 3 3 3 108 111 26 42 42 42 42 42 42 42 18 1 57 57 57 57 56 30 30 30 2 2 2 2 276 500 10 5 5 10 486 26 500 2 501 176 2 245 26 297 315 5 23 295 8 53 247 247 3 296 286 11 50 238 2 237 287 1 259 4 238 53 8 257 265 265 8 312 265 53 318 83 237 265 53 8 313 13 70 237 277 4 271 9 237 271 237 271 249 259 257 502 236 94 178 32 500 325 176 498 326 176 2 2 29 17 14 549 2 29 28 520 504 76 505 547 547 543 543 543 2 2 2 2 2 2 55 9 1 8 3 34 11 4 10 30 30 30 1 29 45 15 30 30 30 20 25 57 1 1 55 55 55 55 64 2 1 1 3 57 4 57 61 62 2 1 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Some corrections by tytso. */ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic. */ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/wordpart.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/ima.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> #include "internal.h" #include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * getname_flags(const char __user *filename, int flags, int *empty) { struct filename *result; char *kname; int len; result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } atomic_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { if (empty) *empty = 1; if (!(flags & LOOKUP_EMPTY)) { putname(result); return ERR_PTR(-ENOENT); } } result->uptr = filename; result->aname = NULL; audit_getname(result); return result; } struct filename * getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags, NULL); } struct filename * getname(const char __user * filename) { return getname_flags(filename, 0, NULL); } struct filename * getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; atomic_set(&result->refcnt, 1); audit_getname(result); return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { if (IS_ERR(name)) return; if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) return; if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (IS_IMMUTABLE(inode)) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; retval = devcgroup_inode_permission(inode, mask); if (retval) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; return false; } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). */ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED)) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. * * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (error) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. */ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. */ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. */ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid; if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || (dir_mode & 0020 && ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { const char *operation = S_ISFIFO(inode->i_mode) ? "sticky_create_fifo" : "sticky_create_regular"; audit_log_path_denied(AUDIT_ANOM_CREAT, operation); return -EACCES; } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY))) return true; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; struct inode *dir = base->d_inode; if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (!err) // success, keep going return 0; if (!try_to_unlazy(nd)) return -ECHILD; // redo it all non-lazy if (err != -ECHILD) // hard error return err; } return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). */ smp_rmb(); if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return ERR_CAST(dentry); } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return step_into(nd, flags, dentry); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. */ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and * return the "hash_len" as the result. */ static inline u64 hash_name(const void *salt, const char *name) { unsigned long a = 0, b, x = 0, y = (unsigned long)salt; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline u64 hash_name(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; do { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); return hashlen_create(end_name_hash(hash), len); } #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') name++; if (!*name) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. */ for(;;) { struct mnt_idmap *idmap; const char *link; u64 hash_len; int type; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (err) return err; hash_len = hash_name(nd->path.dentry, name); type = LAST_NORM; if (name[0] == '.') switch (hashlen_len(hash_len)) { case 2: if (name[1] == '.') { type = LAST_DOTDOT; nd->state |= ND_JUMPED; } break; case 1: type = LAST_DOT; } if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) return err; hash_len = this.hash_len; name = this.name; } } nd->last.hash_len = hash_len; nd->last.name = name; nd->last_type = type; name += hashlen_len(hash_len); if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (!depth) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->name->name; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) return ERR_PTR(-EAGAIN); if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. */ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(nd->dfd); struct dentry *dentry; if (!f.file) return ERR_PTR(-EBADF); dentry = f.file->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) { fdput(f); return ERR_PTR(-ENOTDIR); } nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } fdput(f); } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } /* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; } static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { struct dentry *d; struct qstr last; int type, error; error = filename_parentat(dfd, name, 0, path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) { path_put(path); return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); } return d; } struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); putname(filename); return res; } struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) { struct filename *filename = getname(name); struct dentry *res = __kern_path_locked(dfd, filename, path); putname(filename); return res; } EXPORT_SYMBOL(user_path_locked_at); int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(kern_path); /** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct filename *filename; struct path root = {.mnt = mnt, .dentry = dentry}; int ret; filename = getname_kernel(name); /* the first argument of filename_lookup() is ignored with root */ ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); putname(filename); return ret; } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { this->name = name; this->len = len; this->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (is_dot_dotdot(name, len)) return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this); if (err < 0) return err; } return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); return lookup_dcache(&this, base, 0); } EXPORT_SYMBOL(try_lookup_one_len); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct qstr this; int err; struct dentry *ret; err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); if (!ret) ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The helper should be called without i_mutex held. */ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_unlocked); /** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); /* * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent * _can_ become positive at any time, so callers of lookup_one_len_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); return -ENOENT; } dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); follow_down(path, 0); return 0; } #endif int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { struct filename *filename = getname_flags(name, flags, empty); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(user_path_at_empty); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } // p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p = p1, *q = p2, *r; while ((r = p->d_parent) != p2 && r != p) p = r; if (r == p2) { // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } // p is the root of connected component that contains p1 // p2 does not occur on the path from p to p1 while ((r = q->d_parent) != p1 && r != p && r != q) q = r; if (r == p1) { // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return q; } else if (likely(r == p)) { // both p2 and p1 are descendents of p inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } else { // no common ancestor at the time we'd been called mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); return ERR_PTR(-EXDEV); } } /* * p1 and p2 should be directories on the same fs. */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* * hopefully won't need to touch ->s_vfs_rename_mutex at all. */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); /* * now that p2 is locked, nobody can move in or out of it, * so the test below is safe. */ if (likely(c1->d_parent == p2)) return NULL; /* * c1 got moved out of p2 while we'd been taking locks; * unlock and fall back to slow case. */ inode_unlock(p2->d_inode); } mutex_lock(&c1->d_sb->s_vfs_rename_mutex); /* * nobody can move out of any directories on this fs. */ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new file * @want_excl: whether the file must not yet exist * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct file *file, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (unlikely(d_is_negative(dentry))) error = -ENOENT; } } if (error) { dput(dentry); dentry = ERR_PTR(error); } return dentry; } /* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns 0 on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These case are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case dentry returned in @path might be negative if O_CREAT * hadn't been specified. * * An error code is returned on failure. */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return dentry; } if (d_in_lookup(dentry)) break; error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ return dentry; } /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { /* create side of things */ if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) return ERR_PTR(-EISDIR); } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) fsnotify_create(dir->d_inode, dentry); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = ima_file_check(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } ima_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); putname(filename); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; error = -EEXIST; if (d_is_positive(dentry)) goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } if (unlikely(err2)) { error = err2; goto fail; } return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new device node or file * @dev: device number of device to create * * Create a device node or file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) goto out1; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out1; error = security_path_mknod(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) ima_post_path_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out1: putname(name); return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { return do_mknodat(dfd, getname(filename), mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } /** * vfs_mkdir - create directory * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new directory * * Create a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->mkdir) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; if (max_links && dir->i_nlink >= max_links) return -EMLINK; error = dir->i_op->mkdir(idmap, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkdir); int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putname: putname(name); return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { return do_mkdirat(dfd, getname(pathname), mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return do_mkdirat(AT_FDCWD, getname(pathname), mode); } /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * * Remove a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { int error = may_delete(idmap, dir, dentry, 1); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry) || (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit2; case LAST_DOT: error = -EINVAL; goto exit2; case LAST_ROOT: error = -EBUSY; goto exit2; } error = mnt_want_write(path.mnt); if (error) goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; if (!dentry->d_inode) { error = -ENOENT; goto exit4; } error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } exit1: putname(name); return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { return do_rmdir(AT_FDCWD, getname(pathname)); } /** * vfs_unlink - unlink a filesystem object * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_mutex. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (IS_SWAPFILE(target)) error = -EPERM; else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { fsnotify_unlink(dir, dentry); } else if (!error) { fsnotify_link_count(target); d_delete_notify(dir, dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ int do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; error = -EISDIR; if (type != LAST_NORM) goto exit2; error = mnt_want_write(path.mnt); if (error) goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ if (last.name[last.len] || d_is_negative(dentry)) goto slashes; inode = dentry->d_inode; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit3; error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: dput(dentry); } inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } exit1: putname(name); return error; slashes: if (d_is_negative(dentry)) error = -ENOENT; else if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; if (flag & AT_REMOVEDIR) return do_rmdir(dfd, getname(pathname)); return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { return do_unlinkat(AT_FDCWD, getname(pathname)); } /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @oldname: name of the file to link to * * Create a symlink. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; if (IS_ERR(from)) { error = PTR_ERR(from); goto out_putnames; } retry: dentry = filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putnames: putname(to); putname(from); return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_symlinkat(getname(oldname), newdfd, getname(newname)); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_mutex * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create(idmap, dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be writen back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { error = -EINVAL; goto out_putnames; } /* * To use null names we require CAP_DAC_READ_SEARCH * This ensures that not everyone will be able to create * handlink using the passed filedescriptor. */ if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) { error = -ENOENT; goto out_putnames; } if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) goto out_putnames; new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; idmap = mnt_idmap(new_path.mnt); error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out_putpath: path_put(&old_path); out_putnames: putname(old); putname(new); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { return do_linkat(olddfd, getname_uflags(oldname, flags), newdfd, getname(newname), flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } /** * vfs_rename - rename a filesystem object * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->old_mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* * Lock children. * The source subdirectory needs to be locked on cross-directory * rename or cross-directory exchange since its parent changes. * The target subdirectory needs to be locked on cross-directory * exchange due to parent change and on any rename due to becoming * a victim. * Non-directories need locking in all cases (for NFS reasons); * they get locked after any subdirectories (in inode address order). * * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. */ lock_old_subdir = new_dir != old_dir; lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); if (is_dir) { if (lock_old_subdir) inode_lock_nested(source, I_MUTEX_CHILD); if (target && (!new_is_dir || lock_new_subdir)) inode_lock(target); } else if (new_is_dir) { if (lock_new_subdir) inode_lock_nested(target, I_MUTEX_CHILD); inode_lock(source); } else { lock_two_nondirectories(source, target); } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: if (!is_dir || lock_old_subdir) inode_unlock(source); if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, &old_dentry->d_name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) goto put_names; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) goto put_names; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); if (error) goto put_names; error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); if (error) goto exit1; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); if (IS_ERR(trap)) { error = PTR_ERR(trap); goto exit_lock_rename; } old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; /* source must exist */ error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; error = -EEXIST; if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) goto exit5; if (flags & RENAME_EXCHANGE) { error = -ENOENT; if (d_is_negative(new_dentry)) goto exit5; if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit5; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ if (!(flags & RENAME_EXCHANGE)) error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&old_path, old_dentry, &new_path, new_dentry, flags); if (error) goto exit5; rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); exit1: path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } put_names: putname(from); putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); if (IS_ERR(link)) goto out; len = strlen(link); if (len > (unsigned) buflen) len = buflen; if (copy_to_user(buffer, link, len)) len = -EFAULT; out: return len; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. * * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; if (!dentry) { page = find_get_page(mapping, 0); if (!page) return ERR_PTR(-ECHILD); if (!PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } } else { page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); void page_put_link(void *arg) { put_page(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { DEFINE_DELAYED_CALL(done); int res = readlink_copy(buffer, buflen, page_get_link(dentry, d_inode(dentry), &done)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; void *fsdata = NULL; int err; unsigned int flags; retry: if (nofs) flags = memalloc_nofs_save(); err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; memcpy(page_address(page), symname, len-1); err = aops->write_end(NULL, mapping, 0, len-1, len-1, page, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations);
3359 3368 3361 3377 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/audit.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /** * tomoyo_print_bprm - Print "struct linux_binprm" for auditing. * * @bprm: Pointer to "struct linux_binprm". * @dump: Pointer to "struct tomoyo_page_dump". * * Returns the contents of @bprm on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ static char *tomoyo_print_bprm(struct linux_binprm *bprm, struct tomoyo_page_dump *dump) { static const int tomoyo_buffer_len = 4096 * 2; char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS); char *cp; char *last_start; int len; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool truncated = false; if (!buffer) return NULL; len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ "); cp = buffer + len; if (!argv_count) { memmove(cp, "} envp[]={ ", 11); cp += 11; } last_start = cp; while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) goto out; pos += PAGE_SIZE - offset; /* Read. */ while (offset < PAGE_SIZE) { const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (cp == last_start) *cp++ = '"'; if (cp >= buffer + tomoyo_buffer_len - 32) { /* Reserve some room for "..." string. */ truncated = true; } else if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else if (!c) { *cp++ = '"'; *cp++ = ' '; last_start = cp; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } if (c) continue; if (argv_count) { if (--argv_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } memmove(cp, "} envp[]={ ", 11); cp += 11; last_start = cp; truncated = false; } } else if (envp_count) { if (--envp_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } } } if (!argv_count && !envp_count) break; } offset = 0; } *cp++ = '}'; *cp = '\0'; return buffer; out: snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ... } envp[]= { ... }"); return buffer; } /** * tomoyo_filetype - Get string representation of file type. * * @mode: Mode value for stat(). * * Returns file type string. */ static inline const char *tomoyo_filetype(const umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case 0: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FILE]; case S_IFDIR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_DIRECTORY]; case S_IFLNK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SYMLINK]; case S_IFIFO: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FIFO]; case S_IFSOCK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SOCKET]; case S_IFBLK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_BLOCK_DEV]; case S_IFCHR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_CHAR_DEV]; } return "unknown"; /* This should not happen. */ } /** * tomoyo_print_header - Get header line of audit log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns string representation. * * This function uses kmalloc(), so caller must kfree() if this function * didn't return NULL. */ static char *tomoyo_print_header(struct tomoyo_request_info *r) { struct tomoyo_time stamp; const pid_t gpid = task_pid_nr(current); struct tomoyo_obj_info *obj = r->obj; static const int tomoyo_buffer_len = 4096; char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS); int pos; u8 i; if (!buffer) return NULL; tomoyo_convert_time(ktime_get_real_seconds(), &stamp); pos = snprintf(buffer, tomoyo_buffer_len - 1, "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }", stamp.year, stamp.month, stamp.day, stamp.hour, stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode], str_yes_no(r->granted), gpid, tomoyo_sys_getpid(), tomoyo_sys_getppid(), from_kuid(&init_user_ns, current_uid()), from_kgid(&init_user_ns, current_gid()), from_kuid(&init_user_ns, current_euid()), from_kgid(&init_user_ns, current_egid()), from_kuid(&init_user_ns, current_suid()), from_kgid(&init_user_ns, current_sgid()), from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid())); if (!obj) goto no_obj_info; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct tomoyo_mini_stat *stat; unsigned int dev; umode_t mode; if (!obj->stat_valid[i]) continue; stat = &obj->stat[i]; dev = stat->dev; mode = stat->mode; if (i & 1) { pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, stat->mode & S_IALLUGO); continue; } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, MAJOR(dev), MINOR(dev), mode & S_IALLUGO, tomoyo_filetype(mode)); if (S_ISCHR(mode) || S_ISBLK(mode)) { dev = stat->rdev; pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " dev_major=%u dev_minor=%u", MAJOR(dev), MINOR(dev)); } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " }"); } no_obj_info: if (pos < tomoyo_buffer_len - 1) return buffer; kfree(buffer); return NULL; } /** * tomoyo_init_log - Allocate buffer for audit logs. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns pointer to allocated memory. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf = NULL; char *bprm_info = NULL; const char *header = NULL; char *realpath = NULL; const char *symlink = NULL; int pos; const char *domainname = r->domain->domainname->name; header = tomoyo_print_header(r); if (!header) return NULL; /* +10 is for '\n' etc. and '\0'. */ len += strlen(domainname) + strlen(header) + 10; if (r->ee) { struct file *file = r->ee->bprm->file; realpath = tomoyo_realpath_from_path(&file->f_path); bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump); if (!realpath || !bprm_info) goto out; /* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */ len += strlen(realpath) + 80 + strlen(bprm_info); } else if (r->obj && r->obj->symlink_target) { symlink = r->obj->symlink_target->name; /* +18 is for " symlink.target=\"%s\"" */ len += 18 + strlen(symlink); } len = kmalloc_size_roundup(len); buf = kzalloc(len, GFP_NOFS); if (!buf) goto out; len--; pos = snprintf(buf, len, "%s", header); if (realpath) { struct linux_binprm *bprm = r->ee->bprm; pos += snprintf(buf + pos, len - pos, " exec={ realpath=\"%s\" argc=%d envc=%d %s }", realpath, bprm->argc, bprm->envc, bprm_info); } else if (symlink) pos += snprintf(buf + pos, len - pos, " symlink.target=\"%s\"", symlink); pos += snprintf(buf + pos, len - pos, "\n%s\n", domainname); vsnprintf(buf + pos, len - pos, fmt, args); out: kfree(realpath); kfree(bprm_info); kfree(header); return buf; } /* Wait queue for /sys/kernel/security/tomoyo/audit. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait); /* Structure for audit log. */ struct tomoyo_log { struct list_head list; char *log; int size; }; /* The list for "struct tomoyo_log". */ static LIST_HEAD(tomoyo_log); /* Lock for "struct list_head tomoyo_log". */ static DEFINE_SPINLOCK(tomoyo_log_lock); /* Length of "struct list_head tomoyo_log". */ static unsigned int tomoyo_log_count; /** * tomoyo_get_audit - Get audit mode. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number. * @index: Index number of functionality. * @matched_acl: Pointer to "struct tomoyo_acl_info". * @is_granted: True if granted log, false otherwise. * * Returns true if this request should be audited, false otherwise. */ static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index, const struct tomoyo_acl_info *matched_acl, const bool is_granted) { u8 mode; const u8 category = tomoyo_index2category[index] + TOMOYO_MAX_MAC_INDEX; struct tomoyo_profile *p; if (!tomoyo_policy_loaded) return false; p = tomoyo_profile(ns, profile); if (tomoyo_log_count >= p->pref[TOMOYO_PREF_MAX_AUDIT_LOG]) return false; if (is_granted && matched_acl && matched_acl->cond && matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO) return matched_acl->cond->grant_log == TOMOYO_GRANTLOG_YES; mode = p->config[index]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->config[category]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->default_config; if (is_granted) return mode & TOMOYO_CONFIG_WANT_GRANT_LOG; return mode & TOMOYO_CONFIG_WANT_REJECT_LOG; } /** * tomoyo_write_log2 - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns nothing. */ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf; struct tomoyo_log *entry; bool quota_exceeded = false; if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type, r->matched_acl, r->granted)) goto out; buf = tomoyo_init_log(r, len, fmt, args); if (!buf) goto out; entry = kzalloc(sizeof(*entry), GFP_NOFS); if (!entry) { kfree(buf); goto out; } entry->log = buf; len = kmalloc_size_roundup(strlen(buf) + 1); /* * The entry->size is used for memory quota checks. * Don't go beyond strlen(entry->log). */ entry->size = len + kmalloc_size_roundup(sizeof(*entry)); spin_lock(&tomoyo_log_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] && tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size >= tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT]) { quota_exceeded = true; } else { tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size; list_add_tail(&entry->list, &tomoyo_log); tomoyo_log_count++; } spin_unlock(&tomoyo_log_lock); if (quota_exceeded) { kfree(buf); kfree(entry); goto out; } wake_up(&tomoyo_log_wait); out: return; } /** * tomoyo_write_log - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @fmt: The printf()'s format string, followed by parameters. * * Returns nothing. */ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) { va_list args; int len; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args) + 1; va_end(args); va_start(args, fmt); tomoyo_write_log2(r, len, fmt, args); va_end(args); } /** * tomoyo_read_log - Read an audit log. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ void tomoyo_read_log(struct tomoyo_io_buffer *head) { struct tomoyo_log *ptr = NULL; if (head->r.w_pos) return; kfree(head->read_buf); head->read_buf = NULL; spin_lock(&tomoyo_log_lock); if (!list_empty(&tomoyo_log)) { ptr = list_entry(tomoyo_log.next, typeof(*ptr), list); list_del(&ptr->list); tomoyo_log_count--; tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= ptr->size; } spin_unlock(&tomoyo_log_lock); if (ptr) { head->read_buf = ptr->log; head->r.w[head->r.w_pos++] = head->read_buf; kfree(ptr); } } /** * tomoyo_poll_log - Wait for an audit log. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". Maybe NULL. * * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log. */ __poll_t tomoyo_poll_log(struct file *file, poll_table *wait) { if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; poll_wait(file, &tomoyo_log_wait, wait); if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; return 0; }
452 454 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/stat.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <net/netrom.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/arp.h> #include <linux/init.h> static int nr_ndevs = 4; int sysctl_netrom_default_path_quality = NR_DEFAULT_QUAL; int sysctl_netrom_obsolescence_count_initialiser = NR_DEFAULT_OBS; int sysctl_netrom_network_ttl_initialiser = NR_DEFAULT_TTL; int sysctl_netrom_transport_timeout = NR_DEFAULT_T1; int sysctl_netrom_transport_maximum_tries = NR_DEFAULT_N2; int sysctl_netrom_transport_acknowledge_delay = NR_DEFAULT_T2; int sysctl_netrom_transport_busy_delay = NR_DEFAULT_T4; int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW; int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE; int sysctl_netrom_routing_control = NR_DEFAULT_ROUTING; int sysctl_netrom_link_fails_count = NR_DEFAULT_FAILS; int sysctl_netrom_reset_circuit = NR_DEFAULT_RESET; static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; /* * NETROM network devices are virtual network devices encapsulating NETROM * frames into AX.25 which will be sent through an AX.25 device, so form a * special "super class" of normal net devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key nr_netdev_xmit_lock_key; static struct lock_class_key nr_netdev_addr_lock_key; static void nr_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); } static void nr_set_lockdep_key(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); } /* * Socket removal during an interrupt is now safe. */ static void nr_remove_socket(struct sock *sk) { spin_lock_bh(&nr_list_lock); sk_del_node_init(sk); spin_unlock_bh(&nr_list_lock); } /* * Kill all bound sockets on a dropped device. */ static void nr_kill_by_device(struct net_device *dev) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) if (nr_sk(s)->device == dev) nr_disconnect(s, ENETUNREACH); spin_unlock_bh(&nr_list_lock); } /* * Handle device status changes. */ static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event != NETDEV_DOWN) return NOTIFY_DONE; nr_kill_by_device(dev); nr_rt_device_down(dev); return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void nr_insert_socket(struct sock *sk) { spin_lock_bh(&nr_list_lock); sk_add_node(sk, &nr_list); spin_unlock_bh(&nr_list_lock); } /* * Find a socket that wants to accept the Connect Request we just * received. */ static struct sock *nr_find_listener(ax25_address *addr) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) if (!ax25cmp(&nr_sk(s)->source_addr, addr) && s->sk_state == TCP_LISTEN) { sock_hold(s); goto found; } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find a connected NET/ROM socket given my circuit IDs. */ static struct sock *nr_find_socket(unsigned char index, unsigned char id) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->my_index == index && nr->my_id == id) { sock_hold(s); goto found; } } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find a connected NET/ROM socket given their circuit IDs. */ static struct sock *nr_find_peer(unsigned char index, unsigned char id, ax25_address *dest) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->your_index == index && nr->your_id == id && !ax25cmp(&nr->dest_addr, dest)) { sock_hold(s); goto found; } } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find next free circuit ID. */ static unsigned short nr_find_next_circuit(void) { unsigned short id = circuit; unsigned char i, j; struct sock *sk; for (;;) { i = id / 256; j = id % 256; if (i != 0 && j != 0) { if ((sk=nr_find_socket(i, j)) == NULL) break; sock_put(sk); } id++; } return id; } /* * Deferred destroy. */ void nr_destroy_socket(struct sock *); /* * Handler for deferred kills. */ static void nr_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); sock_hold(sk); nr_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupt users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. */ void nr_destroy_socket(struct sock *sk) { struct sk_buff *skb; nr_remove_socket(sk); nr_stop_heartbeat(sk); nr_stop_t1timer(sk); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* Queue the unaccepted socket for death */ sock_set_flag(skb->sk, SOCK_DEAD); nr_start_heartbeat(skb->sk); nr_sk(skb->sk)->state = NR_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.function = nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; add_timer(&sk->sk_timer); } else sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * NET/ROM socket object. */ static int nr_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); unsigned int opt; if (level != SOL_NETROM) return -ENOPROTOOPT; if (optlen < sizeof(unsigned int)) return -EINVAL; if (copy_from_sockptr(&opt, optval, sizeof(opt))) return -EFAULT; switch (optname) { case NETROM_T1: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t1 = opt * HZ; return 0; case NETROM_T2: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t2 = opt * HZ; return 0; case NETROM_N2: if (opt < 1 || opt > 31) return -EINVAL; nr->n2 = opt; return 0; case NETROM_T4: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t4 = opt * HZ; return 0; case NETROM_IDLE: if (opt > UINT_MAX / (60 * HZ)) return -EINVAL; nr->idle = opt * 60 * HZ; return 0; default: return -ENOPROTOOPT; } } static int nr_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); int val = 0; int len; if (level != SOL_NETROM) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETROM_T1: val = nr->t1 / HZ; break; case NETROM_T2: val = nr->t2 / HZ; break; case NETROM_N2: val = nr->n2; break; case NETROM_T4: val = nr->t4 / HZ; break; case NETROM_IDLE: val = nr->idle / (60 * HZ); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, len, sizeof(int)); if (put_user(len, optlen)) return -EFAULT; return copy_to_user(optval, &val, len) ? -EFAULT : 0; } static int nr_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { release_sock(sk); return -EINVAL; } if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; release_sock(sk); return 0; } release_sock(sk); return -EOPNOTSUPP; } static struct proto nr_proto = { .name = "NETROM", .owner = THIS_MODULE, .obj_size = sizeof(struct nr_sock), }; static int nr_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct nr_sock *nr; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; nr = nr_sk(sk); sock_init_data(sock, sk); sock->ops = &nr_proto_ops; sk->sk_protocol = protocol; skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); skb_queue_head_init(&nr->frag_queue); nr_init_timers(sk); nr->t1 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_timeout)); nr->t2 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_acknowledge_delay)); nr->n2 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_maximum_tries)); nr->t4 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_busy_delay)); nr->idle = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_no_activity_timeout)); nr->window = READ_ONCE(sysctl_netrom_transport_requested_window_size); nr->bpqext = 1; nr->state = NR_STATE_0; return 0; } static struct sock *nr_make_new(struct sock *osk) { struct sock *sk; struct nr_sock *nr, *onr; if (osk->sk_type != SOCK_SEQPACKET) return NULL; sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; nr = nr_sk(sk); sock_init_data(NULL, sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); skb_queue_head_init(&nr->frag_queue); nr_init_timers(sk); onr = nr_sk(osk); nr->t1 = onr->t1; nr->t2 = onr->t2; nr->n2 = onr->n2; nr->t4 = onr->t4; nr->idle = onr->idle; nr->window = onr->window; nr->device = onr->device; nr->bpqext = onr->bpqext; return sk; } static int nr_release(struct socket *sock) { struct sock *sk = sock->sk; struct nr_sock *nr; if (sk == NULL) return 0; sock_hold(sk); sock_orphan(sk); lock_sock(sk); nr = nr_sk(sk); switch (nr->state) { case NR_STATE_0: case NR_STATE_1: case NR_STATE_2: nr_disconnect(sk, 0); nr_destroy_socket(sk); break; case NR_STATE_3: nr_clear_queues(sk); nr->n2count = 0; nr_write_internal(sk, NR_DISCREQ); nr_start_t1timer(sk); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr->state = NR_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DESTROY); break; default: break; } sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; struct net_device *dev; ax25_uid_assoc *user; ax25_address *source; lock_sock(sk); if (!sock_flag(sk, SOCK_ZAPPED)) { release_sock(sk); return -EINVAL; } if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct full_sockaddr_ax25)) { release_sock(sk); return -EINVAL; } if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) { release_sock(sk); return -EINVAL; } if (addr->fsa_ax25.sax25_family != AF_NETROM) { release_sock(sk); return -EINVAL; } if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) { release_sock(sk); return -EADDRNOTAVAIL; } /* * Only the super user can set an arbitrary user callsign. */ if (addr->fsa_ax25.sax25_ndigis == 1) { if (!capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); release_sock(sk); return -EPERM; } nr->user_addr = addr->fsa_digipeater[0]; nr->source_addr = addr->fsa_ax25.sax25_call; } else { source = &addr->fsa_ax25.sax25_call; user = ax25_findbyuid(current_euid()); if (user) { nr->user_addr = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { release_sock(sk); dev_put(dev); return -EPERM; } nr->user_addr = *source; } nr->source_addr = *source; } nr->device = dev; nr_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); dev_put(dev); release_sock(sk); return 0; } static int nr_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; const ax25_address *source = NULL; ax25_uid_assoc *user; struct net_device *dev; int err = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out_release; /* Connect completed during a ERESTARTSYS event */ } if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; err = -ECONNREFUSED; goto out_release; } if (sk->sk_state == TCP_ESTABLISHED) { err = -EISCONN; /* No reconnect on a seqpacket socket */ goto out_release; } if (sock->state == SS_CONNECTING) { err = -EALREADY; goto out_release; } sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) { err = -EINVAL; goto out_release; } if (addr->sax25_family != AF_NETROM) { err = -EINVAL; goto out_release; } if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ sock_reset_flag(sk, SOCK_ZAPPED); if ((dev = nr_dev_first()) == NULL) { err = -ENETUNREACH; goto out_release; } source = (const ax25_address *)dev->dev_addr; user = ax25_findbyuid(current_euid()); if (user) { nr->user_addr = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { dev_put(dev); err = -EPERM; goto out_release; } nr->user_addr = *source; } nr->source_addr = *source; nr->device = dev; dev_put(dev); nr_insert_socket(sk); /* Finish the bind */ } nr->dest_addr = addr->sax25_call; release_sock(sk); circuit = nr_find_next_circuit(); lock_sock(sk); nr->my_index = circuit / 256; nr->my_id = circuit % 256; circuit++; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; nr_establish_data_link(sk); nr->state = NR_STATE_1; nr_start_heartbeat(sk); /* Now the loop */ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { err = -EINPROGRESS; goto out_release; } /* * A Connect Ack with Choke or timeout or failed routing will go to * closed. */ if (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); /* Always set at this point */ goto out_release; } sock->state = SS_CONNECTED; out_release: release_sock(sk); return err; } static int nr_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sk_buff *skb; struct sock *newsk; DEFINE_WAIT(wait); struct sock *sk; int err = 0; if ((sk = sock->sk) == NULL) return -EINVAL; lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; goto out_release; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto out_release; } /* * The write queue this time is holding sockets ready to use * hooked into the SABM we saved */ for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; if (flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ kfree_skb(skb); sk_acceptq_removed(sk); out_release: release_sock(sk); return err; } static int nr_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); int uaddr_len; memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25)); lock_sock(sk); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 1; sax->fsa_ax25.sax25_call = nr->user_addr; memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater)); sax->fsa_digipeater[0] = nr->dest_addr; uaddr_len = sizeof(struct full_sockaddr_ax25); } else { sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 0; sax->fsa_ax25.sax25_call = nr->source_addr; uaddr_len = sizeof(struct sockaddr_ax25); } release_sock(sk); return uaddr_len; } int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) { struct sock *sk; struct sock *make; struct nr_sock *nr_make; ax25_address *src, *dest, *user; unsigned short circuit_index, circuit_id; unsigned short peer_circuit_index, peer_circuit_id; unsigned short frametype, flags, window, timeout; int ret; skb_orphan(skb); /* * skb->data points to the netrom frame start */ src = (ax25_address *)(skb->data + 0); dest = (ax25_address *)(skb->data + 7); circuit_index = skb->data[15]; circuit_id = skb->data[16]; peer_circuit_index = skb->data[17]; peer_circuit_id = skb->data[18]; frametype = skb->data[19] & 0x0F; flags = skb->data[19] & 0xF0; /* * Check for an incoming IP over NET/ROM frame. */ if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); skb_reset_transport_header(skb); return nr_rx_ip(skb, dev); } /* * Find an existing socket connection, based on circuit ID, if it's * a Connect Request base it on their circuit ID. * * Circuit ID 0/0 is not valid but it could still be a "reset" for a * circuit that no longer exists at the other end ... */ sk = NULL; if (circuit_index == 0 && circuit_id == 0) { if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG) sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src); } else { if (frametype == NR_CONNREQ) sk = nr_find_peer(circuit_index, circuit_id, src); else sk = nr_find_socket(circuit_index, circuit_id); } if (sk != NULL) { bh_lock_sock(sk); skb_reset_transport_header(skb); if (frametype == NR_CONNACK && skb->len == 22) nr_sk(sk)->bpqext = 1; else nr_sk(sk)->bpqext = 0; ret = nr_process_rx_frame(sk, skb); bh_unlock_sock(sk); sock_put(sk); return ret; } /* * Now it should be a CONNREQ. */ if (frametype != NR_CONNREQ) { /* * Here it would be nice to be able to send a reset but * NET/ROM doesn't have one. We've tried to extend the protocol * by sending NR_CONNACK | NR_CHOKE_FLAGS replies but that * apparently kills BPQ boxes... :-( * So now we try to follow the established behaviour of * G8PZT's Xrouter which is sending packets with command type 7 * as an extension of the protocol. */ if (READ_ONCE(sysctl_netrom_reset_circuit) && (frametype != NR_RESET || flags != 0)) nr_transmit_reset(skb, 1); return 0; } sk = nr_find_listener(dest); user = (ax25_address *)(skb->data + 21); if (sk == NULL || sk_acceptq_is_full(sk) || (make = nr_make_new(sk)) == NULL) { nr_transmit_refusal(skb, 0); if (sk) sock_put(sk); return 0; } bh_lock_sock(sk); window = skb->data[20]; sock_hold(make); skb->sk = make; skb->destructor = sock_efree; make->sk_state = TCP_ESTABLISHED; /* Fill in his circuit details */ nr_make = nr_sk(make); nr_make->source_addr = *dest; nr_make->dest_addr = *src; nr_make->user_addr = *user; nr_make->your_index = circuit_index; nr_make->your_id = circuit_id; bh_unlock_sock(sk); circuit = nr_find_next_circuit(); bh_lock_sock(sk); nr_make->my_index = circuit / 256; nr_make->my_id = circuit % 256; circuit++; /* Window negotiation */ if (window < nr_make->window) nr_make->window = window; /* L4 timeout negotiation */ if (skb->len == 37) { timeout = skb->data[36] * 256 + skb->data[35]; if (timeout * HZ < nr_make->t1) nr_make->t1 = timeout * HZ; nr_make->bpqext = 1; } else { nr_make->bpqext = 0; } nr_write_internal(make, NR_CONNACK); nr_make->condition = 0x00; nr_make->vs = 0; nr_make->va = 0; nr_make->vr = 0; nr_make->vl = 0; nr_make->state = NR_STATE_3; sk_acceptq_added(sk); skb_queue_head(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); bh_unlock_sock(sk); sock_put(sk); nr_insert_socket(make); nr_start_heartbeat(make); nr_start_idletimer(make); return 1; } static int nr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name); int err; struct sockaddr_ax25 sax; struct sk_buff *skb; unsigned char *asmptr; int size; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { err = -EADDRNOTAVAIL; goto out; } if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); err = -EPIPE; goto out; } if (nr->device == NULL) { err = -ENETUNREACH; goto out; } if (usax) { if (msg->msg_namelen < sizeof(sax)) { err = -EINVAL; goto out; } sax = *usax; if (ax25cmp(&nr->dest_addr, &sax.sax25_call) != 0) { err = -EISCONN; goto out; } if (sax.sax25_family != AF_NETROM) { err = -EINVAL; goto out; } } else { if (sk->sk_state != TCP_ESTABLISHED) { err = -ENOTCONN; goto out; } sax.sax25_family = AF_NETROM; sax.sax25_call = nr->dest_addr; } /* Build a packet - the conventional user limit is 236 bytes. We can do ludicrously large NetROM frames but must not overflow */ if (len > 65536) { err = -EMSGSIZE; goto out; } size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN; if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) goto out; skb_reserve(skb, size - len); skb_reset_transport_header(skb); /* * Push down the NET/ROM header */ asmptr = skb_push(skb, NR_TRANSPORT_LEN); /* Build a NET/ROM Transport header */ *asmptr++ = nr->your_index; *asmptr++ = nr->your_id; *asmptr++ = 0; /* To be filled in later */ *asmptr++ = 0; /* Ditto */ *asmptr++ = NR_INFO; /* * Put the data on the end */ skb_put(skb, len); /* User data follows immediately after the NET/ROM transport header */ if (memcpy_from_msg(skb_transport_header(skb), msg, len)) { kfree_skb(skb); err = -EFAULT; goto out; } if (sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); err = -ENOTCONN; goto out; } nr_output(sk, skb); /* Shove it onto the queue */ err = len; out: release_sock(sk); return err; } static int nr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name); size_t copied; struct sk_buff *skb; int er; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } /* Now we can treat all alike */ skb = skb_recv_datagram(sk, flags, &er); if (!skb) { release_sock(sk); return er; } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } er = skb_copy_datagram_msg(skb, 0, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); return er; } if (sax != NULL) { memset(sax, 0, sizeof(*sax)); sax->sax25_family = AF_NETROM; skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call, AX25_ADDR_LEN); msg->msg_namelen = sizeof(*sax); } skb_free_datagram(sk, skb); release_sock(sk); return copied; } static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; switch (cmd) { case TIOCOUTQ: { long amount; lock_sock(sk); amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; release_sock(sk); return put_user(amount, (int __user *)argp); } case TIOCINQ: { struct sk_buff *skb; long amount = 0L; lock_sock(sk); /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); return put_user(amount, (int __user *)argp); } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: if (!capable(CAP_NET_ADMIN)) return -EPERM; return nr_rt_ioctl(cmd, argp); default: return -ENOIOCTLCMD; } return 0; } #ifdef CONFIG_PROC_FS static void *nr_info_start(struct seq_file *seq, loff_t *pos) __acquires(&nr_list_lock) { spin_lock_bh(&nr_list_lock); return seq_hlist_start_head(&nr_list, *pos); } static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &nr_list, pos); } static void nr_info_stop(struct seq_file *seq, void *v) __releases(&nr_list_lock) { spin_unlock_bh(&nr_list_lock); } static int nr_info_show(struct seq_file *seq, void *v) { struct sock *s = sk_entry(v); struct net_device *dev; struct nr_sock *nr; const char *devname; char buf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, "user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q inode\n"); else { bh_lock_sock(s); nr = nr_sk(s); if ((dev = nr->device) == NULL) devname = "???"; else devname = dev->name; seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr)); seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr)); seq_printf(seq, "%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", ax2asc(buf, &nr->source_addr), devname, nr->my_index, nr->my_id, nr->your_index, nr->your_id, nr->state, nr->vs, nr->vr, nr->va, ax25_display_timer(&nr->t1timer) / HZ, nr->t1 / HZ, ax25_display_timer(&nr->t2timer) / HZ, nr->t2 / HZ, ax25_display_timer(&nr->t4timer) / HZ, nr->t4 / HZ, ax25_display_timer(&nr->idletimer) / (60 * HZ), nr->idle / (60 * HZ), nr->n2count, nr->n2, nr->window, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); bh_unlock_sock(s); } return 0; } static const struct seq_operations nr_info_seqops = { .start = nr_info_start, .next = nr_info_next, .stop = nr_info_stop, .show = nr_info_show, }; #endif /* CONFIG_PROC_FS */ static const struct net_proto_family nr_family_ops = { .family = PF_NETROM, .create = nr_create, .owner = THIS_MODULE, }; static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, .bind = nr_bind, .connect = nr_connect, .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, .poll = datagram_poll, .ioctl = nr_ioctl, .gettstamp = sock_gettstamp, .listen = nr_listen, .shutdown = sock_no_shutdown, .setsockopt = nr_setsockopt, .getsockopt = nr_getsockopt, .sendmsg = nr_sendmsg, .recvmsg = nr_recvmsg, .mmap = sock_no_mmap, }; static struct notifier_block nr_dev_notifier = { .notifier_call = nr_device_event, }; static struct net_device **dev_nr; static struct ax25_protocol nr_pid = { .pid = AX25_P_NETROM, .func = nr_route_frame }; static struct ax25_linkfail nr_linkfail_notifier = { .func = nr_link_failed, }; static int __init nr_proto_init(void) { int i; int rc = proto_register(&nr_proto, 0); if (rc) return rc; if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) { pr_err("NET/ROM: %s - nr_ndevs parameter too large\n", __func__); rc = -EINVAL; goto unregister_proto; } dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL); if (!dev_nr) { pr_err("NET/ROM: %s - unable to allocate device array\n", __func__); rc = -ENOMEM; goto unregister_proto; } for (i = 0; i < nr_ndevs; i++) { char name[IFNAMSIZ]; struct net_device *dev; sprintf(name, "nr%d", i); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup); if (!dev) { rc = -ENOMEM; goto fail; } dev->base_addr = i; rc = register_netdev(dev); if (rc) { free_netdev(dev); goto fail; } nr_set_lockdep_key(dev); dev_nr[i] = dev; } rc = sock_register(&nr_family_ops); if (rc) goto fail; rc = register_netdevice_notifier(&nr_dev_notifier); if (rc) goto out_sock; ax25_register_pid(&nr_pid); ax25_linkfail_register(&nr_linkfail_notifier); #ifdef CONFIG_SYSCTL rc = nr_register_sysctl(); if (rc) goto out_sysctl; #endif nr_loopback_init(); rc = -ENOMEM; if (!proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops)) goto proc_remove1; if (!proc_create_seq("nr_neigh", 0444, init_net.proc_net, &nr_neigh_seqops)) goto proc_remove2; if (!proc_create_seq("nr_nodes", 0444, init_net.proc_net, &nr_node_seqops)) goto proc_remove3; return 0; proc_remove3: remove_proc_entry("nr_neigh", init_net.proc_net); proc_remove2: remove_proc_entry("nr", init_net.proc_net); proc_remove1: nr_loopback_clear(); nr_rt_free(); #ifdef CONFIG_SYSCTL nr_unregister_sysctl(); out_sysctl: #endif ax25_linkfail_release(&nr_linkfail_notifier); ax25_protocol_release(AX25_P_NETROM); unregister_netdevice_notifier(&nr_dev_notifier); out_sock: sock_unregister(PF_NETROM); fail: while (--i >= 0) { unregister_netdev(dev_nr[i]); free_netdev(dev_nr[i]); } kfree(dev_nr); unregister_proto: proto_unregister(&nr_proto); return rc; } module_init(nr_proto_init); module_param(nr_ndevs, int, 0); MODULE_PARM_DESC(nr_ndevs, "number of NET/ROM devices"); MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_NETROM); static void __exit nr_exit(void) { int i; remove_proc_entry("nr", init_net.proc_net); remove_proc_entry("nr_neigh", init_net.proc_net); remove_proc_entry("nr_nodes", init_net.proc_net); nr_loopback_clear(); nr_rt_free(); #ifdef CONFIG_SYSCTL nr_unregister_sysctl(); #endif ax25_linkfail_release(&nr_linkfail_notifier); ax25_protocol_release(AX25_P_NETROM); unregister_netdevice_notifier(&nr_dev_notifier); sock_unregister(PF_NETROM); for (i = 0; i < nr_ndevs; i++) { struct net_device *dev = dev_nr[i]; if (dev) { unregister_netdev(dev); free_netdev(dev); } } kfree(dev_nr); proto_unregister(&nr_proto); } module_exit(nr_exit);
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0 /* RTT/RTO calculation. * * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) * * https://tools.ietf.org/html/rfc6298 * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf */ #include <linux/net.h> #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) #define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) { return 200; } static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { return (peer->srtt_us >> 3) + peer->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { return min(rto, RXRPC_RTO_MAX); } /* * Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ u32 srtt = peer->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (peer->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (peer->mdev_us >> 2); /* similar update on mdev */ } peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (peer->mdev_us > peer->mdev_max_us) { peer->mdev_max_us = peer->mdev_us; if (peer->mdev_max_us > peer->rttvar_us) peer->rttvar_us = peer->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); peer->mdev_max_us = peer->rttvar_us; } peer->srtt_us = max(1U, srtt); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void rxrpc_set_rto(struct rxrpc_peer *peer) { u32 rto; /* 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ rto = __rxrpc_set_rto(peer); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ peer->rto_us = rxrpc_bound_rto(rto); } static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) { if (rtt_us < 0) return; //rxrpc_update_rtt_min(peer, rtt_us); rxrpc_rtt_estimator(peer, rtt_us); rxrpc_set_rto(peer); /* RFC6298: only reset backoff on valid RTT measurement. */ peer->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has * exclusive access to the peer RTT data. */ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; spin_lock(&peer->rtt_input_lock); rxrpc_ack_update_rtt(peer, rtt_us); if (peer->rtt_count < 3) peer->rtt_count++; spin_unlock(&peer->rtt_input_lock); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, peer->srtt_us >> 3, peer->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) { u64 timo_us; u32 backoff = READ_ONCE(peer->backoff); timo_us = peer->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(peer->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; return ns_to_ktime(timo_us * NSEC_PER_USEC); } void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) { peer->rto_us = RXRPC_TIMEOUT_INIT; peer->mdev_us = RXRPC_TIMEOUT_INIT; peer->backoff = 0; //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); }
14 5 3 2 41 42 42 30 8 3 2 4 2 34 21 22 22 1 21 11 4 18 18 18 7 11 42 42 18 6 3 3 904 821 802 19 1247 87 12 85 4 39 21 1 19 2 24 188 126 5 276 19 1 1 12 267 846 848 1194 60 6 24 71 1810 220 1677 1703 335 19 26 1306 172 1092 735 140 1027 1026 188 1678 136 115 98 15 15 61 63 4 4 4 575 576 576 568 565 568 1078 1076 1076 673 670 670 19 19 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. */ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. * * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
187 163 92 63 92 93 3 253 4 4 5 4 50 52 52 307 108 68 149 49 120 119 149 140 11 148 8 8 8 8 8 7 1 3 5 8 1 6 2 8 7 4 3 1 1 1 2 4 4 4 3 1 1 1 1 1 1 1 16 16 4 5 24 2 2 1 2 15 2 157 2 3 2 2 142 12 5 5 2 16 11 40 7 13 1 35 6 45 59 78 18 121 16 2 7 1 120 10 24 12 3 9 1 133 14 119 7 7 2 1 5 5 6 4 6 7 63 47 31 47 47 4 41 7 12 35 38 1 38 31 9 27 27 13 18 18 5 5 17 18 9 27 27 22 21 108 65 52 35 18 27 8 457 35 456 181 386 53 66 96 3 25 25 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. */ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -EINVAL; e_rpf: return -EXDEV; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; /* Within the same container, it is regarded as a martian source, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) return -EINVAL; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. */ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return -ENODEV; *colon = ':'; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } rcu_read_unlock(); if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_unlock(); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); if (!inet_validate_dscp(rtm->rtm_tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); err = -EINVAL; goto errout; } cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); return -EINVAL; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto errout; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto errout; } err = fib_table_delete(net, tb, &cfg, extack); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto errout; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (filter->rtnl_held) ASSERT_RTNL(); if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); if (filter->rtnl_held) filter->dev = __dev_get_by_index(net, ifindex); else filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, .rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); err = -ENOENT; goto unlock; } err = fib_table_dump(tb, skb, cb, &filter); goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) goto out; dumped = 1; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. */ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_tos = frn->fl_tos, .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. */ fib_disable_ip(dev, event, true); } else { rt_cache_flush(dev_net(dev)); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; ASSERT_RTNL(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. */ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: rtnl_lock(); ip_fib_net_exit(net); rtnl_unlock(); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); } static void __net_exit fib_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip_fib_net_exit(net); rtnl_unlock(); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, .exit_batch = fib_net_exit_batch, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, RTNL_FLAG_DUMP_UNLOCKED); }
9 9 9 17 13 3 12 4 10 6 10 6 13 3 2 7 16 16 3 1 3 3 1 3 6 6 2 2 3 3 3 6 6 6 1 1 7 7 7 7 7 7 1 1 1 1 1 1 16 2 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 // SPDX-License-Identifier: GPL-2.0 #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/rcupdate.h> #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" struct ila_xlat_params { struct ila_params ip; int ifindex; }; struct ila_map { struct ila_xlat_params xp; struct rhash_head node; struct ila_map __rcu *next; struct rcu_head rcu; }; #define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, MAX_LOCKS, LOCKS_PER_CPU, GFP_KERNEL); } static u32 hashrnd __read_mostly; static __always_inline void __ila_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static inline u32 ila_locator_hash(struct ila_locator loc) { u32 *v = (u32 *)loc.v32; __ila_hash_secret_init(); return jhash_2words(v[0], v[1], hashrnd); } static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, struct ila_addr *iaddr, int ifindex) { return (ila->xp.ifindex && ila->xp.ifindex != ifindex); } static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *xp) { return (ila->xp.ifindex != xp->ifindex); } static int ila_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct ila_map *ila = obj; return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key); } static inline int ila_order(struct ila_map *ila) { int score = 0; if (ila->xp.ifindex) score += 1 << 1; return score; } static const struct rhashtable_params rht_params = { .nelem_hint = 1024, .head_offset = offsetof(struct ila_map, node), .key_offset = offsetof(struct ila_map, xp.ip.locator_match), .key_len = sizeof(u64), /* identifier */ .max_size = 1048576, .min_size = 256, .automatic_shrinking = true, .obj_cmpfn = ila_cmpfn, }; static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { memset(xp, 0, sizeof(*xp)); if (info->attrs[ILA_ATTR_LOCATOR]) xp->ip.locator.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR]); if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); if (info->attrs[ILA_ATTR_CSUM_MODE]) xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); else xp->ip.csum_mode = ILA_CSUM_NO_ACTION; if (info->attrs[ILA_ATTR_IDENT_TYPE]) xp->ip.ident_type = nla_get_u8( info->attrs[ILA_ATTR_IDENT_TYPE]); else xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); return 0; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, int ifindex, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { if (!ila_cmp_params(ila, xp)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } static inline void ila_release(struct ila_map *ila) { kfree_rcu(ila, rcu); } static void ila_free_node(struct ila_map *ila) { struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { next = rcu_access_pointer(ila->next); ila_release(ila); ila = next; } } static void ila_free_cb(void *ptr, void *arg) { ila_free_node((struct ila_map *)ptr); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { ila_xlat_addr(skb, false); return NF_ACCEPT; } static const struct nf_hook_ops ila_nf_hook_ops[] = { { .hook = ila_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = -1, }, }; static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; if (!ilan->xlat.hooks_registered) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ err = nf_register_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); if (err) return err; ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); if (!ila) return -ENOMEM; ila_init_saved_csum(&xp->ip); ila->xp = *xp; order = ila_order(ila); spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; do { if (!ila_cmp_params(tila, xp)) { err = -EEXIST; goto out; } if (order > ila_order(tila)) break; prev = tila; tila = rcu_dereference_protected(tila->next, lockdep_is_held(lock)); } while (tila); if (prev) { /* Insert in sub list of head */ RCU_INIT_POINTER(ila->next, tila); rcu_assign_pointer(prev->next, ila); } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) goto out; } } out: spin_unlock(lock); if (err) kfree(ila); return err; } static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head, *prev; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = -ENOENT; spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; prev = NULL; while (ila) { if (ila_cmp_params(ila, xp)) { prev = ila; ila = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); continue; } err = 0; if (prev) { /* Not head, just delete from list */ rcu_assign_pointer(prev->next, ila->next); } else { /* It is the head. If there is something in the * sublist we need to make a new head. */ head = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); if (head) { /* Put first entry in the sublist into the * table */ err = rhashtable_replace_fast( &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ err = rhashtable_remove_fast( &ilan->xlat.rhash_table, &ila->node, rht_params); } } ila_release(ila); break; } out: spin_unlock(lock); return err; } int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params p; int err; err = parse_nl_config(info, &p); if (err) return err; return ila_add_mapping(net, &p); } int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; int err; err = parse_nl_config(info, &xp); if (err) return err; ila_del_mapping(net, &xp); return 0; } static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, struct ila_map *ila) { return ila_get_lock(ilan, ila->xp.ip.locator_match); } int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct rhashtable_iter iter; struct ila_map *ila; spinlock_t *lock; int ret = 0; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter); rhashtable_walk_start(&iter); for (;;) { ila = rhashtable_walk_next(&iter); if (IS_ERR(ila)) { if (PTR_ERR(ila) == -EAGAIN) continue; ret = PTR_ERR(ila); goto done; } else if (!ila) { break; } lock = lock_from_ila_map(ilan, ila); spin_lock(lock); ret = rhashtable_remove_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); if (!ret) ila_free_node(ila); spin_unlock(lock); if (ret) break; } done: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); return ret; } static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, (__force u64)ila->xp.ip.locator.v64, ILA_ATTR_PAD) || nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH, (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; } static int ila_dump_info(struct ila_map *ila, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (ila_fill_info(ila, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct sk_buff *msg; struct ila_xlat_params xp; struct ila_map *ila; int ret; ret = parse_nl_config(info, &xp); if (ret) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); ret = -ESRCH; ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); } rcu_read_unlock(); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } struct ila_dump_iter { struct rhashtable_iter rhiter; int skip; }; int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_dump_iter *iter; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter); iter->skip = 0; cb->args[0] = (long)iter; return 0; } int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; rhashtable_walk_exit(&iter->rhiter); kfree(iter); return 0; } int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); /* Get first entry */ ila = rhashtable_walk_peek(rhiter); if (ila && !IS_ERR(ila) && skip) { /* Skip over visited entries */ while (ila && skip) { /* Skip over any ila entries in this list that we * have already dumped. */ ila = rcu_access_pointer(ila->next); skip--; } } skip = 0; for (;;) { if (IS_ERR(ila)) { ret = PTR_ERR(ila); if (ret == -EAGAIN) { /* Table has changed and iter has reset. Return * -EAGAIN to the application even if we have * written data to the skb. The application * needs to deal with this. */ goto out_ret; } else { break; } } else if (!ila) { ret = 0; break; } while (ila) { ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) goto out; skip++; ila = rcu_access_pointer(ila->next); } skip = 0; ila = rhashtable_walk_next(rhiter); } out: iter->skip = skip; ret = (skb->len ? : ret); out_ret: rhashtable_walk_stop(rhiter); return ret; } int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); int err; err = alloc_ila_locks(ilan); if (err) return err; err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params); if (err) { free_bucket_spinlocks(ilan->xlat.locks); return err; } return 0; } void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); /* Assumes skb contains a valid IPv6 header that is pulled */ /* No check here that ILA type in the mapping matches what is in the * address. We assume that whatever sender gaves us can be translated. * The checksum mode however is relevant. */ rcu_read_lock(); ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); return 0; }
145 2392 2711 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_H #define _LINUX_PID_H #include <linux/pid_types.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/wait.h> /* * What is struct pid? * * A struct pid is the kernel's internal notion of a process identifier. * It refers to individual tasks, process groups, and sessions. While * there are processes attached to it the struct pid lives in a hash * table, so it and then the processes that it refers to can be found * quickly from the numeric pid value. The attached processes may be * quickly accessed by following pointers from struct pid. * * Storing pid_t values in the kernel and referring to them later has a * problem. The process originally with that pid may have exited and the * pid allocator wrapped, and another process could have come along * and been assigned that pid. * * Referring to user space processes by holding a reference to struct * task_struct has a problem. When the user space process exits * the now useless task_struct is still kept. A task_struct plus a * stack consumes around 10K of low kernel memory. More precisely * this is THREAD_SIZE + sizeof(struct task_struct). By comparison * a struct pid is about 64 bytes. * * Holding a reference to struct pid solves both of these problems. * It is small so holding a reference does not consume a lot of * resources, and since a new struct pid is allocated when the numeric pid * value is reused (when pids wrap around) we don't mistakenly refer to new * processes. */ /* * struct upid is used to get the id of the struct pid, as it is * seen in particular namespace. Later the struct pid is found with * find_pid_ns() using the int nr and struct pid_namespace *ns. */ struct upid { int nr; struct pid_namespace *ns; }; struct pid { refcount_t count; unsigned int level; spinlock_t lock; #ifdef CONFIG_FS_PID struct dentry *stashed; unsigned long ino; #endif /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; /* wait queue for pidfd notifications */ wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[]; }; extern struct pid init_struct_pid; struct file; struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); void do_notify_pidfd(struct task_struct *task); static inline struct pid *get_pid(struct pid *pid) { if (pid) refcount_inc(&pid->count); return pid; } extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); static inline bool pid_has_task(struct pid *pid, enum pid_type type) { return !hlist_empty(&pid->tasks[type]); } extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); /* * these helpers must be called with the tasklist_lock write-held. */ extern void attach_pid(struct task_struct *task, enum pid_type); extern void detach_pid(struct task_struct *task, enum pid_type); extern void change_pid(struct task_struct *task, enum pid_type, struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); extern int pid_max; extern int pid_max_min, pid_max_max; /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. * * find_pid_ns() finds the pid in the namespace specified * find_vpid() finds the pid by its virtual id, i.e. in the current namespace * * see also find_task_by_vpid() set in include/linux/sched.h */ extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns); extern struct pid *find_vpid(int nr); /* * Lookup a PID in the hash table, and return with it's count elevated. */ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); /* * ns_of_pid() returns the pid namespace in which the specified pid was * allocated. * * NOTE: * ns_of_pid() is expected to be called for a process (task) that has * an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid * is expected to be non-NULL. If @pid is NULL, caller should handle * the resulting NULL pid-ns. */ static inline struct pid_namespace *ns_of_pid(struct pid *pid) { struct pid_namespace *ns = NULL; if (pid) ns = pid->numbers[pid->level].ns; return ns; } /* * is_child_reaper returns true if the pid is the init process * of the current namespace. As this one could be checked before * pid_ns->child_reaper is assigned in copy_process, we check * with the pid number. */ static inline bool is_child_reaper(struct pid *pid) { return pid->numbers[pid->level].nr == 1; } /* * the helpers to get the pid's id seen from different namespaces * * pid_nr() : global id, i.e. the id seen from the init namespace; * pid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * pid_nr_ns() : id seen from the ns specified. * * see also task_xid_nr() etc in include/linux/sched.h */ static inline pid_t pid_nr(struct pid *pid) { pid_t nr = 0; if (pid) nr = pid->numbers[0].nr; return nr; } pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); pid_t pid_vnr(struct pid *pid); #define do_each_pid_task(pid, type, task) \ do { \ if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), \ &(pid)->tasks[type], pid_links[type]) { /* * Both old and new leaders may be attached to * the same pid in the middle of de_thread(). */ #define while_each_pid_task(pid, type, task) \ if (type == PIDTYPE_PID) \ break; \ } \ } while (0) #define do_each_pid_thread(pid, type, task) \ do_each_pid_task(pid, type, task) { \ struct task_struct *tg___ = task; \ for_each_thread(tg___, task) { #define while_each_pid_thread(pid, type, task) \ } \ task = tg___; \ } while_each_pid_task(pid, type, task) static inline struct pid *task_pid(struct task_struct *task) { return task->thread_pid; } /* * the helpers to get the task's different pids as they are seen * from various namespaces * * task_xid_nr() : global id, i.e. the id seen from the init namespace; * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * task_xid_nr_ns() : id seen from the ns specified; * * see also pid_nr() etc in include/linux/pid.h */ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); } static inline pid_t task_pid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } static inline pid_t task_tgid_nr(struct task_struct *tsk) { return tsk->tgid; } /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. * * Test if a process is not yet dead (at most zombie state) * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. * * Return: 1 if the process is alive. 0 otherwise. */ static inline int pid_alive(const struct task_struct *p) { return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); } static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); } static inline pid_t task_session_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) { pid_t pid = 0; rcu_read_lock(); if (pid_alive(tsk)) pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); rcu_read_unlock(); return pid; } static inline pid_t task_ppid_nr(const struct task_struct *tsk) { return task_ppid_nr_ns(tsk, &init_pid_ns); } /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { return task_pgrp_nr_ns(tsk, &init_pid_ns); } /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. * * Return: 1 if the task structure is init. 0 otherwise. */ static inline int is_global_init(struct task_struct *tsk) { return task_tgid_nr(tsk) == 1; } #endif /* _LINUX_PID_H */
9 4 4 1 1 3 1 1 1 6 3 3 3 3 3 2 4 1 1 1 1 1 22 7 7 15 15 3 6 6 1 1 4 42 43 15 34 1 1 1 1 1 1 9 9 2 7 5 5 1 2 2 13 11 11 2 2 10 3 3 3 12 1 13 5 8 3 1 1 2 2 1 2 3 3 6 6 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 3 1 1 2 2 3 1 2 2 1 1 5 1 1 1 1 1 2 1 1 12 1 6 4 1 4 1 3 2 2 1 1 2 1 1 1 1 9 9 3 5 5 5 1 1 4 1 3 2 1 2 3 3 3 3 1 2 2 2 3 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 28 26 1 1 1 1 1 1 1 1 1 1 111 72 143 38 180 32 9 5 13 1 1 3 1 2 1 1 4 1 2 3 2 5 2 2 1 9 1 4 3 2 1 1 1 1 1 3 1 1 1 3 2 12 1 1 1 2 1 1 1 3 28 1 1 1 1 1 179 176 8 144 38 181 3 180 1 182 104 63 1 13 181 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/ethtool.c - Ethtool ioctl handler * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> * * This file is where we call all the ethtool_ops commands to get * the information ethtool needs. */ #include <linux/compat.h> #include <linux/etherdevice.h> #include <linux/module.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/sfp.h> #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> #include <linux/pm_runtime.h> #include <linux/utsname.h> #include <net/devlink.h> #include <net/ipv6.h> #include <net/xdp_sock_drv.h> #include <net/flow_offload.h> #include <linux/ethtool_netlink.h> #include "common.h" /* State held across locks and calls for commands which have devlink fallback */ struct ethtool_devlink_compat { struct devlink *devlink; union { struct ethtool_flash efl; struct ethtool_drvinfo info; }; }; static struct devlink *netdev_to_devlink_get(struct net_device *dev) { if (!dev->devlink_port) return NULL; return devlink_try_get(dev->devlink_port->devlink); } /* * Some useful ethtool_ops methods that're device independent. * If we find that all drivers want to do the same thing here, * we can turn these into dev_() function calls. */ u32 ethtool_op_get_link(struct net_device *dev) { /* Synchronize carrier state with link watch, see also rtnl_getlink() */ linkwatch_sync_dev(dev); return netif_carrier_ok(dev) ? 1 : 0; } EXPORT_SYMBOL(ethtool_op_get_link); int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; return 0; } EXPORT_SYMBOL(ethtool_op_get_ts_info); /* Handlers for each ethtool command */ static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { .cmd = ETHTOOL_GFEATURES, .size = ETHTOOL_DEV_FEATURE_WORDS, }; struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; u32 __user *sizeaddr; u32 copy_size; int i; /* in case feature bits run out again */ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { features[i].available = (u32)(dev->hw_features >> (32 * i)); features[i].requested = (u32)(dev->wanted_features >> (32 * i)); features[i].active = (u32)(dev->features >> (32 * i)); features[i].never_changed = (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); } sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); if (get_user(copy_size, sizeaddr)) return -EFAULT; if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) copy_size = ETHTOOL_DEV_FEATURE_WORDS; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (copy_to_user(useraddr, features, array_size(copy_size, sizeof(*features)))) return -EFAULT; return 0; } static int ethtool_set_features(struct net_device *dev, void __user *useraddr) { struct ethtool_sfeatures cmd; struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; netdev_features_t wanted = 0, valid = 0; int i, ret = 0; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) return -EINVAL; if (copy_from_user(features, useraddr, sizeof(features))) return -EFAULT; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { valid |= (netdev_features_t)features[i].valid << (32 * i); wanted |= (netdev_features_t)features[i].requested << (32 * i); } if (valid & ~NETIF_F_ETHTOOL_BITS) return -EINVAL; if (valid & ~dev->hw_features) { valid &= dev->hw_features; ret |= ETHTOOL_F_UNSUPPORTED; } dev->wanted_features &= ~valid; dev->wanted_features |= wanted & valid; __netdev_update_features(dev); if ((dev->wanted_features ^ dev->features) & valid) ret |= ETHTOOL_F_WISH; return ret; } static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); if (sset == ETH_SS_PHY_TUNABLES) return ARRAY_SIZE(phy_tunable_strings); if (sset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_sset_count) return phy_ops->get_sset_count(dev->phydev); if (sset == ETH_SS_LINK_MODES) return __ETHTOOL_LINK_MODE_MASK_NBITS; if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else return -EOPNOTSUPP; } static void __ethtool_get_strings(struct net_device *dev, u32 stringset, u8 *data) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_strings) phy_ops->get_strings(dev->phydev, data); else if (stringset == ETH_SS_LINK_MODES) memcpy(data, link_mode_names, __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) { /* feature masks of legacy discrete ethtool ops */ switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; case ETHTOOL_GSG: case ETHTOOL_SSG: return NETIF_F_SG | NETIF_F_FRAGLIST; case ETHTOOL_GTSO: case ETHTOOL_STSO: return NETIF_F_ALL_TSO; case ETHTOOL_GGSO: case ETHTOOL_SGSO: return NETIF_F_GSO; case ETHTOOL_GGRO: case ETHTOOL_SGRO: return NETIF_F_GRO; default: BUG(); } } static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr, u32 ethcmd) { netdev_features_t mask = ethtool_get_feature_mask(ethcmd); struct ethtool_value edata = { .cmd = ethcmd, .data = !!(dev->features & mask), }; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_one_feature(struct net_device *dev, void __user *useraddr, u32 ethcmd) { struct ethtool_value edata; netdev_features_t mask; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; mask = ethtool_get_feature_mask(ethcmd); mask &= dev->hw_features; if (!mask) return -EOPNOTSUPP; if (edata.data) dev->wanted_features |= mask; else dev->wanted_features &= ~mask; __netdev_update_features(dev); return 0; } #define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) #define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ NETIF_F_RXHASH) static u32 __ethtool_get_flags(struct net_device *dev) { u32 flags = 0; if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN; if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN; if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; return flags; } static int __ethtool_set_flags(struct net_device *dev, u32 data) { netdev_features_t features = 0, changed; if (data & ~ETH_ALL_FLAGS) return -EINVAL; if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX; if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX; if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ changed = (features ^ dev->features) & ETH_ALL_FEATURES; if (changed & ~dev->hw_features) return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; dev->wanted_features = (dev->wanted_features & ~changed) | (features & changed); __netdev_update_features(dev); return 0; } /* Given two link masks, AND them together and save the result in dst. */ void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, struct ethtool_link_ksettings *src) { unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int idx = 0; for (; idx < size; idx++) { dst->link_modes.supported[idx] &= src->link_modes.supported[idx]; dst->link_modes.advertising[idx] &= src->link_modes.advertising[idx]; } } EXPORT_SYMBOL(ethtool_intersect_link_masks); void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { linkmode_zero(dst); dst[0] = legacy_u32; } EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); /* return false if src had higher bits set. lower bits always updated. */ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, const unsigned long *src) { *legacy_u32 = src[0]; return find_next_bit(src, __ETHTOOL_LINK_MODE_MASK_NBITS, 32) == __ETHTOOL_LINK_MODE_MASK_NBITS; } EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); /* return false if ksettings link modes had higher bits * set. legacy_settings always updated (best effort) */ static bool convert_link_ksettings_to_legacy_settings( struct ethtool_cmd *legacy_settings, const struct ethtool_link_ksettings *link_ksettings) { bool retval = true; memset(legacy_settings, 0, sizeof(*legacy_settings)); /* this also clears the deprecated fields in legacy structure: * __u8 transceiver; * __u32 maxtxpkt; * __u32 maxrxpkt; */ retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->supported, link_ksettings->link_modes.supported); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->advertising, link_ksettings->link_modes.advertising); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->lp_advertising, link_ksettings->link_modes.lp_advertising); ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); legacy_settings->duplex = link_ksettings->base.duplex; legacy_settings->port = link_ksettings->base.port; legacy_settings->phy_address = link_ksettings->base.phy_address; legacy_settings->autoneg = link_ksettings->base.autoneg; legacy_settings->mdio_support = link_ksettings->base.mdio_support; legacy_settings->eth_tp_mdix = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; legacy_settings->transceiver = link_ksettings->base.transceiver; return retval; } /* number of 32-bit words to store the user's link mode bitmaps */ #define __ETHTOOL_LINK_MODE_MASK_NU32 \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) /* layout of the struct passed from/to userland */ struct ethtool_link_usettings { struct ethtool_link_settings base; struct { __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; } link_modes; }; /* Internal kernel helper to query a device ethtool_link_settings. */ int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; memset(link_ksettings, 0, sizeof(*link_ksettings)); return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } EXPORT_SYMBOL(__ethtool_get_link_ksettings); /* convert ethtool_link_usettings in user space to a kernel internal * ethtool_link_ksettings. return 0 on success, errno on error. */ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, const void __user *from) { struct ethtool_link_usettings link_usettings; if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) return -EFAULT; memcpy(&to->base, &link_usettings.base, sizeof(to->base)); bitmap_from_arr32(to->link_modes.supported, link_usettings.link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.advertising, link_usettings.link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.lp_advertising, link_usettings.link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); return 0; } /* Check if the user is trying to change anything besides speed/duplex */ bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd) { struct ethtool_link_settings base2 = {}; base2.speed = cmd->base.speed; base2.port = PORT_OTHER; base2.duplex = cmd->base.duplex; base2.cmd = cmd->base.cmd; base2.link_mode_masks_nwords = cmd->base.link_mode_masks_nwords; return !memcmp(&base2, &cmd->base, sizeof(base2)) && bitmap_empty(cmd->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS) && bitmap_empty(cmd->link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); } /* convert a kernel internal ethtool_link_ksettings to * ethtool_link_usettings in user space. return 0 on success, errno on * error. */ static int store_link_ksettings_for_user(void __user *to, const struct ethtool_link_ksettings *from) { struct ethtool_link_usettings link_usettings; memcpy(&link_usettings, from, sizeof(link_usettings)); bitmap_to_arr32(link_usettings.link_modes.supported, from->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.advertising, from->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.lp_advertising, from->link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) return -EFAULT; return 0; } /* Query device for its ethtool_link_settings. */ static int ethtool_get_link_ksettings(struct net_device *dev, void __user *useraddr) { int err = 0; struct ethtool_link_ksettings link_ksettings; ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; /* handle bitmap nbits handshake */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) { /* wrong link mode nbits requested */ memset(&link_ksettings, 0, sizeof(link_ksettings)); link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; /* send back number of words required as negative val */ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, "need too many bits for link modes!"); link_ksettings.base.link_mode_masks_nwords = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); /* copy the base fields back to user, not the link * mode bitmaps */ if (copy_to_user(useraddr, &link_ksettings.base, sizeof(link_ksettings.base))) return -EFAULT; return 0; } /* handshake successful: user/kernel agree on * link_mode_masks_nwords */ memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; /* make sure we tell the right values to user */ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; link_ksettings.base.rate_matching = RATE_MATCH_NONE; return store_link_ksettings_for_user(useraddr, &link_ksettings); } /* Update device ethtool_link_settings. */ static int ethtool_set_link_ksettings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings = {}; int err; ASSERT_RTNL(); if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; /* make sure nbits field has expected value */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; /* copy the whole structure, now that we know it has expected * format */ err = load_link_ksettings_from_user(&link_ksettings, useraddr); if (err) return err; /* re-check nwords field, just in case */ if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; if (link_ksettings.base.master_slave_cfg || link_ksettings.base.master_slave_state) return -EINVAL; err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); } return err; } int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex) { u32 speed; u8 duplex; speed = cmd->base.speed; duplex = cmd->base.duplex; /* don't allow custom speed and duplex */ if (!ethtool_validate_speed(speed) || !ethtool_validate_duplex(duplex) || !ethtool_virtdev_validate_cmd(cmd)) return -EINVAL; *dev_speed = speed; *dev_duplex = duplex; return 0; } EXPORT_SYMBOL(ethtool_virtdev_set_link_ksettings); /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, this is * now implemented via get_link_ksettings. When driver reports higher link mode * bits, a kernel warning is logged once (with name of 1st driver/device) to * recommend user to upgrade ethtool, but the command is successful (only the * lower link mode bits reported back to user). Deprecated fields from * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero. */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; int err; ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); /* send a sensible cmd tag back to user */ cmd.cmd = ETHTOOL_GSET; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; return 0; } /* Update device link settings with given ethtool_cmd. * * Backward compatibility note: for compatibility with legacy ethtool, this is * now always implemented via set_link_settings. When user's request updates * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel * warning is logged once (with name of 1st driver/device) to recommend user to * upgrade ethtool, and the request is rejected. */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; int ret; ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) return -EINVAL; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (ret >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); } return ret; } static int ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp) { const struct ethtool_ops *ops = dev->ethtool_ops; struct device *parent = dev->dev.parent; rsp->info.cmd = ETHTOOL_GDRVINFO; strscpy(rsp->info.version, init_uts_ns.name.release, sizeof(rsp->info.version)); if (ops->get_drvinfo) { ops->get_drvinfo(dev, &rsp->info); if (!rsp->info.bus_info[0] && parent) strscpy(rsp->info.bus_info, dev_name(parent), sizeof(rsp->info.bus_info)); if (!rsp->info.driver[0] && parent && parent->driver) strscpy(rsp->info.driver, parent->driver->name, sizeof(rsp->info.driver)); } else if (parent && parent->driver) { strscpy(rsp->info.bus_info, dev_name(parent), sizeof(rsp->info.bus_info)); strscpy(rsp->info.driver, parent->driver->name, sizeof(rsp->info.driver)); } else if (dev->rtnl_link_ops) { strscpy(rsp->info.driver, dev->rtnl_link_ops->kind, sizeof(rsp->info.driver)); } else { return -EOPNOTSUPP; } /* * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. */ if (ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); if (rc >= 0) rsp->info.testinfo_len = rc; rc = ops->get_sset_count(dev, ETH_SS_STATS); if (rc >= 0) rsp->info.n_stats = rc; rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) rsp->info.n_priv_flags = rc; } if (ops->get_regs_len) { int ret = ops->get_regs_len(dev); if (ret > 0) rsp->info.regdump_len = ret; } if (ops->get_eeprom_len) rsp->info.eedump_len = ops->get_eeprom_len(dev); if (!rsp->info.fw_version[0]) rsp->devlink = netdev_to_devlink_get(dev); return 0; } static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, void __user *useraddr) { struct ethtool_sset_info info; u64 sset_mask; int i, idx = 0, n_bits = 0, ret, rc; u32 *info_buf = NULL; if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; /* store copy of mask, because we zero struct later on */ sset_mask = info.sset_mask; if (!sset_mask) return 0; /* calculate size of return buffer */ n_bits = hweight64(sset_mask); memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GSSET_INFO; info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER); if (!info_buf) return -ENOMEM; /* * fill return buffer based on input bitmask and successful * get_sset_count return */ for (i = 0; i < 64; i++) { if (!(sset_mask & (1ULL << i))) continue; rc = __ethtool_get_sset_count(dev, i); if (rc >= 0) { info.sset_mask |= (1ULL << i); info_buf[idx++] = rc; } } ret = -EFAULT; if (copy_to_user(useraddr, &info, sizeof(info))) goto out; useraddr += offsetof(struct ethtool_sset_info, data); if (copy_to_user(useraddr, info_buf, array_size(idx, sizeof(u32)))) goto out; ret = 0; out: kfree(info_buf); return ret; } static noinline_for_stack int ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc, const struct compat_ethtool_rxnfc __user *useraddr, size_t size) { struct compat_ethtool_rxnfc crxnfc = {}; /* We expect there to be holes between fs.m_ext and * fs.ring_cookie and at the end of fs, but nowhere else. * On non-x86, no conversion should be needed. */ BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) && sizeof(struct compat_ethtool_rxnfc) != sizeof(struct ethtool_rxnfc)); BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + sizeof(useraddr->fs.m_ext) != offsetof(struct ethtool_rxnfc, fs.m_ext) + sizeof(rxnfc->fs.m_ext)); BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) - offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != offsetof(struct ethtool_rxnfc, fs.location) - offsetof(struct ethtool_rxnfc, fs.ring_cookie)); if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc)))) return -EFAULT; *rxnfc = (struct ethtool_rxnfc) { .cmd = crxnfc.cmd, .flow_type = crxnfc.flow_type, .data = crxnfc.data, .fs = { .flow_type = crxnfc.fs.flow_type, .h_u = crxnfc.fs.h_u, .h_ext = crxnfc.fs.h_ext, .m_u = crxnfc.fs.m_u, .m_ext = crxnfc.fs.m_ext, .ring_cookie = crxnfc.fs.ring_cookie, .location = crxnfc.fs.location, }, .rule_cnt = crxnfc.rule_cnt, }; return 0; } static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc, const void __user *useraddr, size_t size) { if (compat_need_64bit_alignment_fixup()) return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size); if (copy_from_user(rxnfc, useraddr, size)) return -EFAULT; return 0; } static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) { struct compat_ethtool_rxnfc crxnfc; memset(&crxnfc, 0, sizeof(crxnfc)); crxnfc = (struct compat_ethtool_rxnfc) { .cmd = rxnfc->cmd, .flow_type = rxnfc->flow_type, .data = rxnfc->data, .fs = { .flow_type = rxnfc->fs.flow_type, .h_u = rxnfc->fs.h_u, .h_ext = rxnfc->fs.h_ext, .m_u = rxnfc->fs.m_u, .m_ext = rxnfc->fs.m_ext, .ring_cookie = rxnfc->fs.ring_cookie, .location = rxnfc->fs.location, }, .rule_cnt = rxnfc->rule_cnt, }; if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc)))) return -EFAULT; return 0; } static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info, size_t *info_size, void __user *useraddr) { /* struct ethtool_rxnfc was originally defined for * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data * members. User-space might still be using that * definition. */ if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) *info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info->data)); if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) return -EFAULT; if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) { *info_size = sizeof(*info); if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) return -EFAULT; /* Since malicious users may modify the original data, * we need to check whether FLOW_RSS is still requested. */ if (!(info->flow_type & FLOW_RSS)) return -EINVAL; } if (info->cmd != cmd) return -EINVAL; return 0; } static int ethtool_rxnfc_copy_to_user(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) { int ret; if (compat_need_64bit_alignment_fixup()) { ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size, rule_buf); useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs); } else { ret = copy_to_user(useraddr, rxnfc, size); useraddr += offsetof(struct ethtool_rxnfc, rule_locs); } if (ret) return -EFAULT; if (rule_buf) { if (copy_to_user(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32))) return -EFAULT; } return 0; } static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc info; size_t info_size = sizeof(info); int rc; if (!ops->set_rxnfc) return -EOPNOTSUPP; rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (rc) return rc; if (ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; rc = ops->get_rxfh(dev, &rxfh); if (rc) return rc; /* Sanity check: if symmetric-xor is set, then: * 1 - no other fields besides IP src/dst and/or L4 src/dst * 2 - If src is set, dst must also be set */ if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) || (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3)))) return -EINVAL; } rc = ops->set_rxnfc(dev, &info); if (rc) return rc; if (cmd == ETHTOOL_SRXCLSRLINS && ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL)) return -EFAULT; return 0; } static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { struct ethtool_rxnfc info; size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; int ret; void *rule_buf = NULL; if (!ops->get_rxnfc) return -EOPNOTSUPP; ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) rule_buf = kcalloc(info.rule_cnt, sizeof(u32), GFP_USER); if (!rule_buf) return -ENOMEM; } } ret = ops->get_rxnfc(dev, &info, rule_buf); if (ret < 0) goto err_out; ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); err_out: kfree(rule_buf); return ret; } static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, struct ethtool_rxnfc *rx_rings, u32 size) { int i; if (copy_from_user(indir, useraddr, array_size(size, sizeof(indir[0])))) return -EFAULT; /* Validate ring indices */ for (i = 0; i < size; i++) if (indir[i] >= rx_rings->data) return -EINVAL; return 0; } u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { BUG_ON(len > sizeof(netdev_rss_key)); net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); memcpy(buffer, netdev_rss_key, len); } EXPORT_SYMBOL(netdev_rss_key_fill); static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { struct ethtool_rxfh_param rxfh = {}; u32 user_size; int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; rxfh.indir_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (rxfh.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), &rxfh.indir_size, sizeof(rxfh.indir_size))) return -EFAULT; /* If the user buffer size is 0, this is just a query for the * device table size. Otherwise, if it's smaller than the * device table size it's an error. */ if (user_size < rxfh.indir_size) return user_size == 0 ? 0 : -EINVAL; rxfh.indir = kcalloc(rxfh.indir_size, sizeof(rxfh.indir[0]), GFP_USER); if (!rxfh.indir) return -ENOMEM; ret = dev->ethtool_ops->get_rxfh(dev, &rxfh); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, ring_index[0]), rxfh.indir, rxfh.indir_size * sizeof(*rxfh.indir))) ret = -EFAULT; out: kfree(rxfh.indir); return ret; } static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; u32 user_size, i; int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); if (!ops->get_rxfh_indir_size || !ops->set_rxfh || !ops->get_rxnfc) return -EOPNOTSUPP; rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); if (rxfh_dev.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (user_size != 0 && user_size != rxfh_dev.indir_size) return -EINVAL; rxfh_dev.indir = kcalloc(rxfh_dev.indir_size, sizeof(rxfh_dev.indir[0]), GFP_USER); if (!rxfh_dev.indir) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; if (user_size == 0) { u32 *indir = rxfh_dev.indir; for (i = 0; i < rxfh_dev.indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, &rx_rings, rxfh_dev.indir_size); if (ret) goto out; } rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE; ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) goto out; /* indicate whether rxfh was set to default */ if (user_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rxfh_dev.indir); return ret; } static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; u32 user_indir_size, user_key_size; struct ethtool_rxfh rxfh; u32 indir_bytes; u8 *rss_config; u32 total_size; int ret; if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) rxfh_dev.key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !ops->cap_rss_ctx_supported) return -EOPNOTSUPP; rxfh.indir_size = rxfh_dev.indir_size; rxfh.key_size = rxfh_dev.key_size; if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; if ((user_indir_size && user_indir_size != rxfh_dev.indir_size) || (user_key_size && user_key_size != rxfh_dev.key_size)) return -EINVAL; indir_bytes = user_indir_size * sizeof(rxfh_dev.indir[0]); total_size = indir_bytes + user_key_size; rss_config = kzalloc(total_size, GFP_USER); if (!rss_config) return -ENOMEM; if (user_indir_size) rxfh_dev.indir = (u32 *)rss_config; if (user_key_size) rxfh_dev.key = rss_config + indir_bytes; rxfh_dev.rss_context = rxfh.rss_context; ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), &rxfh_dev.hfunc, sizeof(rxfh.hfunc))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, input_xfrm), &rxfh_dev.input_xfrm, sizeof(rxfh.input_xfrm))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_config[0]), rss_config, total_size)) { ret = -EFAULT; } out: kfree(rss_config); return ret; } static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, void __user *useraddr) { u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); const struct ethtool_ops *ops = dev->ethtool_ops; u32 dev_indir_size = 0, dev_key_size = 0, i; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; u32 indir_bytes = 0; u8 *rss_config; int ret; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !ops->cap_rss_ctx_supported) return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && !ops->cap_rss_sym_xor_supported) return -EOPNOTSUPP; /* If either indir, hash key or function is valid, proceed further. * Must request at least one change: indir size, hash key, function * or input transformation. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE && rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); if (!rss_config) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. */ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, &rx_rings, rxfh.indir_size); if (ret) goto out; } else if (rxfh.indir_size == 0) { if (rxfh.rss_context == 0) { u32 *indir; rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; indir = rxfh_dev.indir; for (i = 0; i < dev_indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { rxfh_dev.rss_delete = true; } } if (rxfh.key_size) { rxfh_dev.key_size = dev_key_size; rxfh_dev.key = rss_config + indir_bytes; if (copy_from_user(rxfh_dev.key, useraddr + rss_cfg_offset + indir_bytes, rxfh.key_size)) { ret = -EFAULT; goto out; } } rxfh_dev.hfunc = rxfh.hfunc; rxfh_dev.rss_context = rxfh.rss_context; rxfh_dev.input_xfrm = rxfh.input_xfrm; ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context))) ret = -EFAULT; if (!rxfh_dev.rss_context) { /* indicate whether rxfh was set to default */ if (rxfh.indir_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) dev->priv_flags |= IFF_RXFH_CONFIGURED; } out: kfree(rss_config); return ret; } static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; const struct ethtool_ops *ops = dev->ethtool_ops; void *regbuf; int reglen, ret; if (!ops->get_regs || !ops->get_regs_len) return -EOPNOTSUPP; if (copy_from_user(&regs, useraddr, sizeof(regs))) return -EFAULT; reglen = ops->get_regs_len(dev); if (reglen <= 0) return reglen; if (regs.len > reglen) regs.len = reglen; regbuf = vzalloc(reglen); if (!regbuf) return -ENOMEM; if (regs.len < reglen) reglen = regs.len; ops->get_regs(dev, &regs, regbuf); ret = -EFAULT; if (copy_to_user(useraddr, &regs, sizeof(regs))) goto out; useraddr += offsetof(struct ethtool_regs, data); if (copy_to_user(useraddr, regbuf, reglen)) goto out; ret = 0; out: vfree(regbuf); return ret; } static int ethtool_reset(struct net_device *dev, char __user *useraddr) { struct ethtool_value reset; int ret; if (!dev->ethtool_ops->reset) return -EOPNOTSUPP; if (copy_from_user(&reset, useraddr, sizeof(reset))) return -EFAULT; ret = dev->ethtool_ops->reset(dev, &reset.data); if (ret) return ret; if (copy_to_user(useraddr, &reset, sizeof(reset))) return -EFAULT; return 0; } static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; memset(&wol, 0, sizeof(struct ethtool_wolinfo)); wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &wol); if (copy_to_user(useraddr, &wol, sizeof(wol))) return -EFAULT; return 0; } static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol, cur_wol; int ret; if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) return -EOPNOTSUPP; memset(&cur_wol, 0, sizeof(struct ethtool_wolinfo)); cur_wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &cur_wol); if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; if (wol.wolopts & ~cur_wol.supported) return -EINVAL; if (wol.wolopts == cur_wol.wolopts && !memcmp(wol.sopass, cur_wol.sopass, sizeof(wol.sopass))) return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; dev->wol_enabled = !!wol.wolopts; ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); return 0; } static void eee_to_keee(struct ethtool_keee *keee, const struct ethtool_eee *eee) { memset(keee, 0, sizeof(*keee)); keee->eee_enabled = eee->eee_enabled; keee->tx_lpi_enabled = eee->tx_lpi_enabled; keee->tx_lpi_timer = eee->tx_lpi_timer; ethtool_convert_legacy_u32_to_link_mode(keee->advertised, eee->advertised); } static void keee_to_eee(struct ethtool_eee *eee, const struct ethtool_keee *keee) { bool overflow; memset(eee, 0, sizeof(*eee)); eee->eee_active = keee->eee_active; eee->eee_enabled = keee->eee_enabled; eee->tx_lpi_enabled = keee->tx_lpi_enabled; eee->tx_lpi_timer = keee->tx_lpi_timer; overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported, keee->supported); ethtool_convert_link_mode_to_legacy_u32(&eee->advertised, keee->advertised); ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised, keee->lp_advertised); if (overflow) pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n"); } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_keee keee; struct ethtool_eee eee; int rc; if (!dev->ethtool_ops->get_eee) return -EOPNOTSUPP; memset(&keee, 0, sizeof(keee)); rc = dev->ethtool_ops->get_eee(dev, &keee); if (rc) return rc; keee_to_eee(&eee, &keee); if (copy_to_user(useraddr, &eee, sizeof(eee))) return -EFAULT; return 0; } static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_keee keee; struct ethtool_eee eee; int ret; if (!dev->ethtool_ops->set_eee) return -EOPNOTSUPP; if (copy_from_user(&eee, useraddr, sizeof(eee))) return -EFAULT; eee_to_keee(&keee, &eee); ret = dev->ethtool_ops->set_eee(dev, &keee); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); return ret; } static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) return -EOPNOTSUPP; return dev->ethtool_ops->nway_reset(dev); } static int ethtool_get_link(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; int link = __ethtool_get_link(dev); if (link < 0) return link; edata.data = link; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, int (*getter)(struct net_device *, struct ethtool_eeprom *, u8 *), u32 total_len) { struct ethtool_eeprom eeprom; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > total_len) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); ret = getter(dev, &eeprom, data); if (ret) break; if (!eeprom.len) { ret = -EIO; break; } if (copy_to_user(userbuf, data, eeprom.len)) { ret = -EFAULT; break; } userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } eeprom.len = userbuf - (useraddr + sizeof(eeprom)); eeprom.offset -= eeprom.len; if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) ret = -EFAULT; kfree(data); return ret; } static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, ops->get_eeprom_len(dev)); } static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) { struct ethtool_eeprom eeprom; const struct ethtool_ops *ops = dev->ethtool_ops; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (!ops->set_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); if (copy_from_user(data, userbuf, eeprom.len)) { ret = -EFAULT; break; } ret = ops->set_eeprom(dev, &eeprom, data); if (ret) break; userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } kfree(data); return ret; } static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; struct kernel_ethtool_coalesce kernel_coalesce = {}; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (ret) return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; return 0; } static bool ethtool_set_coalesce_supported(struct net_device *dev, struct ethtool_coalesce *coalesce) { u32 supported_params = dev->ethtool_ops->supported_coalesce_params; u32 nonzero_params = 0; if (coalesce->rx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_RX_USECS; if (coalesce->rx_max_coalesced_frames) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES; if (coalesce->rx_coalesce_usecs_irq) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_IRQ; if (coalesce->rx_max_coalesced_frames_irq) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ; if (coalesce->tx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_TX_USECS; if (coalesce->tx_max_coalesced_frames) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES; if (coalesce->tx_coalesce_usecs_irq) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_IRQ; if (coalesce->tx_max_coalesced_frames_irq) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ; if (coalesce->stats_block_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_STATS_BLOCK_USECS; if (coalesce->use_adaptive_rx_coalesce) nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_RX; if (coalesce->use_adaptive_tx_coalesce) nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_TX; if (coalesce->pkt_rate_low) nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_LOW; if (coalesce->rx_coalesce_usecs_low) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_LOW; if (coalesce->rx_max_coalesced_frames_low) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW; if (coalesce->tx_coalesce_usecs_low) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_LOW; if (coalesce->tx_max_coalesced_frames_low) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW; if (coalesce->pkt_rate_high) nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_HIGH; if (coalesce->rx_coalesce_usecs_high) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_HIGH; if (coalesce->rx_max_coalesced_frames_high) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH; if (coalesce->tx_coalesce_usecs_high) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_HIGH; if (coalesce->tx_max_coalesced_frames_high) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH; if (coalesce->rate_sample_interval) nonzero_params |= ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL; return (supported_params & nonzero_params) == nonzero_params; } static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce; int ret; if (!dev->ethtool_ops->set_coalesce || !dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (ret) return ret; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) return -EFAULT; if (!ethtool_set_coalesce_supported(dev, &coalesce)) return -EOPNOTSUPP; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); return ret; } static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam = {}; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; dev->ethtool_ops->get_ringparam(dev, &ringparam, &kernel_ringparam, NULL); if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) return -EFAULT; return 0; } static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam; int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || ringparam.rx_mini_pending > max.rx_mini_max_pending || ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || ringparam.tx_pending > max.tx_max_pending) return -EINVAL; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); return ret; } static noinline_for_stack int ethtool_get_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; if (!dev->ethtool_ops->get_channels) return -EOPNOTSUPP; dev->ethtool_ops->get_channels(dev, &channels); if (copy_to_user(useraddr, &channels, sizeof(channels))) return -EFAULT; return 0; } static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u16 from_channel, to_channel; u64 max_rxnfc_in_use; u32 max_rxfh_in_use; unsigned int i; int ret; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; dev->ethtool_ops->get_channels(dev, &curr); if (channels.rx_count == curr.rx_count && channels.tx_count == curr.tx_count && channels.combined_count == curr.combined_count && channels.other_count == curr.other_count) return 0; /* ensure new counts are within the maximums */ if (channels.rx_count > curr.max_rx || channels.tx_count > curr.max_tx || channels.combined_count > curr.max_combined || channels.other_count > curr.max_other) return -EINVAL; /* ensure there is at least one RX and one TX channel */ if (!channels.combined_count && (!channels.rx_count || !channels.tx_count)) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow * indirection table/rxnfc settings */ if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) max_rxnfc_in_use = 0; if (!netif_is_rxfh_configured(dev) || ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use)) max_rxfh_in_use = 0; if (channels.combined_count + channels.rx_count <= max_t(u64, max_rxnfc_in_use, max_rxfh_in_use)) return -EINVAL; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); for (i = from_channel; i < to_channel; i++) if (xsk_get_pool_from_qid(dev, i)) return -EINVAL; ret = dev->ethtool_ops->set_channels(dev, &channels); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); return ret; } static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM }; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; dev->ethtool_ops->get_pauseparam(dev, &pauseparam); if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) return -EFAULT; return 0; } static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam; int ret; if (!dev->ethtool_ops->set_pauseparam) return -EOPNOTSUPP; if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) return -EFAULT; ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); return ret; } static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, test_len; if (!ops->self_test || !ops->get_sset_count) return -EOPNOTSUPP; test_len = ops->get_sset_count(dev, ETH_SS_TEST); if (test_len < 0) return test_len; WARN_ON(test_len == 0); if (copy_from_user(&test, useraddr, sizeof(test))) return -EFAULT; test.len = test_len; data = kcalloc(test_len, sizeof(u64), GFP_USER); if (!data) return -ENOMEM; netif_testing_on(dev); ops->self_test(dev, &test, data); netif_testing_off(dev); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) goto out; useraddr += sizeof(test); if (copy_to_user(useraddr, data, array_size(test.len, sizeof(u64)))) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) { struct ethtool_gstrings gstrings; u8 *data; int ret; if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; ret = __ethtool_get_sset_count(dev, gstrings.string_set); if (ret < 0) return ret; if (ret > S32_MAX / ETH_GSTRING_LEN) return -ENOMEM; WARN_ON_ONCE(!ret); gstrings.len = ret; if (gstrings.len) { data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); if (!data) return -ENOMEM; __ethtool_get_strings(dev, gstrings.string_set, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) goto out; useraddr += sizeof(gstrings); if (gstrings.len && copy_to_user(useraddr, data, array_size(gstrings.len, ETH_GSTRING_LEN))) goto out; ret = 0; out: vfree(data); return ret; } __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(*data, ETH_GSTRING_LEN, fmt, args); va_end(args); *data += ETH_GSTRING_LEN; } EXPORT_SYMBOL(ethtool_sprintf); void ethtool_puts(u8 **data, const char *str) { strscpy(*data, str, ETH_GSTRING_LEN); *data += ETH_GSTRING_LEN; } EXPORT_SYMBOL(ethtool_puts); static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; static bool busy; const struct ethtool_ops *ops = dev->ethtool_ops; netdevice_tracker dev_tracker; int rc; if (!ops->set_phys_id) return -EOPNOTSUPP; if (busy) return -EBUSY; if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); if (rc < 0) return rc; /* Drop the RTNL lock while waiting, but prevent reentry or * removal of the device. */ busy = true; netdev_hold(dev, &dev_tracker, GFP_KERNEL); rtnl_unlock(); if (rc == 0) { /* Driver will handle this itself */ schedule_timeout_interruptible( id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); } else { /* Driver expects to be called at twice the frequency in rc */ int n = rc * 2, interval = HZ / n; u64 count = mul_u32_u32(n, id.data); u64 i = 0; do { rtnl_lock(); rc = ops->set_phys_id(dev, (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); rtnl_unlock(); if (rc) break; schedule_timeout_interruptible(interval); } while (!signal_pending(current) && (!id.data || i < count)); } rtnl_lock(); netdev_put(dev, &dev_tracker); busy = false; (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); return rc; } static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) { struct ethtool_stats stats; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, n_stats; if (!ops->get_ethtool_stats || !ops->get_sset_count) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; if (n_stats) { data = vzalloc(array_size(n_stats, sizeof(u64))); if (!data) return -ENOMEM; ops->get_ethtool_stats(dev, &stats, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; out: vfree(data); return ret; } static int ethtool_vzalloc_stats_array(int n_stats, u64 **data) { if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; if (WARN_ON_ONCE(!n_stats)) return -EOPNOTSUPP; *data = vzalloc(array_size(n_stats, sizeof(u64))); if (!*data) return -ENOMEM; return 0; } static int ethtool_get_phy_stats_phydev(struct phy_device *phydev, struct ethtool_stats *stats, u64 **data) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; int n_stats, ret; if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats) return -EOPNOTSUPP; n_stats = phy_ops->get_sset_count(phydev); ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) return ret; stats->n_stats = n_stats; return phy_ops->get_stats(phydev, stats, *data); } static int ethtool_get_phy_stats_ethtool(struct net_device *dev, struct ethtool_stats *stats, u64 **data) { const struct ethtool_ops *ops = dev->ethtool_ops; int n_stats, ret; if (!ops || !ops->get_sset_count || ops->get_ethtool_phy_stats) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) return ret; stats->n_stats = n_stats; ops->get_ethtool_phy_stats(dev, stats, *data); return 0; } static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_stats stats; u64 *data = NULL; int ret = -EOPNOTSUPP; if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; if (phydev) ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data); if (ret == -EOPNOTSUPP) ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data); if (ret) goto out; if (copy_to_user(useraddr, &stats, sizeof(stats))) { ret = -EFAULT; goto out; } useraddr += sizeof(stats); if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64)))) ret = -EFAULT; out: vfree(data); return ret; } static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) { struct ethtool_perm_addr epaddr; if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) return -EFAULT; if (epaddr.size < dev->addr_len) return -ETOOSMALL; epaddr.size = dev->addr_len; if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) return -EFAULT; useraddr += sizeof(epaddr); if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) return -EFAULT; return 0; } static int ethtool_get_value(struct net_device *dev, char __user *useraddr, u32 cmd, u32 (*actor)(struct net_device *)) { struct ethtool_value edata = { .cmd = cmd }; if (!actor) return -EOPNOTSUPP; edata.data = actor(dev); if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, void (*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; actor(dev, edata.data); return 0; } static int ethtool_set_value(struct net_device *dev, char __user *useraddr, int (*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; return actor(dev, edata.data); } static int ethtool_flash_device(struct net_device *dev, struct ethtool_devlink_compat *req) { if (!dev->ethtool_ops->flash_device) { req->devlink = netdev_to_devlink_get(dev); return 0; } return dev->ethtool_ops->flash_device(dev, &req->efl); } static int ethtool_set_dump(struct net_device *dev, void __user *useraddr) { struct ethtool_dump dump; if (!dev->ethtool_ops->set_dump) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; return dev->ethtool_ops->set_dump(dev, &dump); } static int ethtool_get_dump_flag(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_dump dump; const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; ret = ops->get_dump_flag(dev, &dump); if (ret) return ret; if (copy_to_user(useraddr, &dump, sizeof(dump))) return -EFAULT; return 0; } static int ethtool_get_dump_data(struct net_device *dev, void __user *useraddr) { int ret; __u32 len; struct ethtool_dump dump, tmp; const struct ethtool_ops *ops = dev->ethtool_ops; void *data = NULL; if (!ops->get_dump_data || !ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); tmp.cmd = ETHTOOL_GET_DUMP_FLAG; ret = ops->get_dump_flag(dev, &tmp); if (ret) return ret; len = min(tmp.len, dump.len); if (!len) return -EFAULT; /* Don't ever let the driver think there's more space available * than it requested with .get_dump_flag(). */ dump.len = len; /* Always allocate enough space to hold the whole thing so that the * driver does not need to check the length and bother with partial * dumping. */ data = vzalloc(tmp.len); if (!data) return -ENOMEM; ret = ops->get_dump_data(dev, &dump, data); if (ret) goto out; /* There are two sane possibilities: * 1. The driver's .get_dump_data() does not touch dump.len. * 2. Or it may set dump.len to how much it really writes, which * should be tmp.len (or len if it can do a partial dump). * In any case respond to userspace with the actual length of data * it's receiving. */ WARN_ON(dump.len != len && dump.len != tmp.len); dump.len = len; if (copy_to_user(useraddr, &dump, sizeof(dump))) { ret = -EFAULT; goto out; } useraddr += offsetof(struct ethtool_dump, data); if (copy_to_user(useraddr, data, len)) ret = -EFAULT; out: vfree(data); return ret; } static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) { struct ethtool_ts_info info; int err; err = __ethtool_get_ts_info(dev, &info); if (err) return err; if (copy_to_user(useraddr, &info, sizeof(info))) return -EFAULT; return 0; } int ethtool_get_module_info_call(struct net_device *dev, struct ethtool_modinfo *modinfo) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->sfp_bus) return sfp_get_module_info(dev->sfp_bus, modinfo); if (phydev && phydev->drv && phydev->drv->module_info) return phydev->drv->module_info(phydev, modinfo); if (ops->get_module_info) return ops->get_module_info(dev, modinfo); return -EOPNOTSUPP; } static int ethtool_get_module_info(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) return -EFAULT; ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) return -EFAULT; return 0; } int ethtool_get_module_eeprom_call(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->sfp_bus) return sfp_get_module_eeprom(dev->sfp_bus, ee, data); if (phydev && phydev->drv && phydev->drv->module_eeprom) return phydev->drv->module_eeprom(phydev, ee, data); if (ops->get_module_eeprom) return ops->get_module_eeprom(dev, ee, data); return -EOPNOTSUPP; } static int ethtool_get_module_eeprom(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; return ethtool_get_any_eeprom(dev, useraddr, ethtool_get_module_eeprom_call, modinfo.eeprom_len); } static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: case ETHTOOL_TX_COPYBREAK: case ETHTOOL_TX_COPYBREAK_BUF_SIZE: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; break; case ETHTOOL_PFC_PREVENTION_TOUT: if (tuna->len != sizeof(u16) || tuna->type_id != ETHTOOL_TUNABLE_U16) return -EINVAL; break; default: return -EINVAL; } return 0; } static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->get_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; ret = ops->get_tunable(dev, &tuna, data); if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->set_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); ret = ops->set_tunable(dev, &tuna, data); kfree(data); return ret; } static noinline_for_stack int ethtool_get_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int ret; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if (!dev->ethtool_ops->get_per_queue_coalesce) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; useraddr += sizeof(coalesce); } return 0; } static noinline_for_stack int ethtool_set_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int i, ret = 0; int n_queue; struct ethtool_coalesce *backup = NULL, *tmp = NULL; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if ((!dev->ethtool_ops->set_per_queue_coalesce) || (!dev->ethtool_ops->get_per_queue_coalesce)) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); if (!backup) return -ENOMEM; for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); if (ret != 0) goto roll_back; tmp++; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { ret = -EFAULT; goto roll_back; } if (!ethtool_set_coalesce_supported(dev, &coalesce)) { ret = -EOPNOTSUPP; goto roll_back; } ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) goto roll_back; useraddr += sizeof(coalesce); } roll_back: if (ret != 0) { tmp = backup; for_each_set_bit(i, queue_mask, bit) { dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); tmp++; } } kfree(backup); return ret; } static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, void __user *useraddr, u32 sub_cmd) { struct ethtool_per_queue_op per_queue_opt; if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) return -EFAULT; if (per_queue_opt.sub_command != sub_cmd) return -EINVAL; switch (per_queue_opt.sub_command) { case ETHTOOL_GCOALESCE: return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); case ETHTOOL_SCOALESCE: return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); default: return -EOPNOTSUPP; } } static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_PHY_DOWNSHIFT: case ETHTOOL_PHY_FAST_LINK_DOWN: if (tuna->len != sizeof(u8) || tuna->type_id != ETHTOOL_TUNABLE_U8) return -EINVAL; break; case ETHTOOL_PHY_EDPD: if (tuna->len != sizeof(u16) || tuna->type_id != ETHTOOL_TUNABLE_U16) return -EINVAL; break; default: return -EINVAL; } return 0; } static int get_phy_tunable(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_tunable tuna; bool phy_drv_tunable; void *data; int ret; phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable; if (!phy_drv_tunable && !dev->ethtool_ops->get_phy_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; if (phy_drv_tunable) { mutex_lock(&phydev->lock); ret = phydev->drv->get_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); } else { ret = dev->ethtool_ops->get_phy_tunable(dev, &tuna, data); } if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int set_phy_tunable(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_tunable tuna; bool phy_drv_tunable; void *data; int ret; phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable; if (!phy_drv_tunable && !dev->ethtool_ops->set_phy_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); if (phy_drv_tunable) { mutex_lock(&phydev->lock); ret = phydev->drv->set_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); } else { ret = dev->ethtool_ops->set_phy_tunable(dev, &tuna, data); } kfree(data); return ret; } static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM }; int rc; if (!dev->ethtool_ops->get_fecparam) return -EOPNOTSUPP; rc = dev->ethtool_ops->get_fecparam(dev, &fecparam); if (rc) return rc; if (WARN_ON_ONCE(fecparam.reserved)) fecparam.reserved = 0; if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; return 0; } static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam; if (!dev->ethtool_ops->set_fecparam) return -EOPNOTSUPP; if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; if (!fecparam.fec || fecparam.fec & ETHTOOL_FEC_NONE) return -EINVAL; fecparam.active_fec = 0; fecparam.reserved = 0; return dev->ethtool_ops->set_fecparam(dev, &fecparam); } /* The main entry point in this file. Called from net/core/dev_ioctl.c */ static int __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, u32 ethcmd, struct ethtool_devlink_compat *devlink_state) { struct net_device *dev; u32 sub_cmd; int rc; netdev_features_t old_features; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) return -ENODEV; if (ethcmd == ETHTOOL_PERQUEUE) { if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) return -EFAULT; } else { sub_cmd = ethcmd; } /* Allow some commands to be done by anyone */ switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GLINK: case ETHTOOL_GCOALESCE: case ETHTOOL_GRINGPARAM: case ETHTOOL_GPAUSEPARAM: case ETHTOOL_GRXCSUM: case ETHTOOL_GTXCSUM: case ETHTOOL_GSG: case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GSTATS: case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: case ETHTOOL_GRXFHINDIR: case ETHTOOL_GRSSH: case ETHTOOL_GFEATURES: case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: case ETHTOOL_GFECPARAM: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; } if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); if (!netif_device_present(dev)) { rc = -ENODEV; goto out; } if (dev->ethtool_ops->begin) { rc = dev->ethtool_ops->begin(dev); if (rc < 0) goto out; } old_features = dev->features; switch (ethcmd) { case ETHTOOL_GSET: rc = ethtool_get_settings(dev, useraddr); break; case ETHTOOL_SSET: rc = ethtool_set_settings(dev, useraddr); break; case ETHTOOL_GDRVINFO: rc = ethtool_get_drvinfo(dev, devlink_state); break; case ETHTOOL_GREGS: rc = ethtool_get_regs(dev, useraddr); break; case ETHTOOL_GWOL: rc = ethtool_get_wol(dev, useraddr); break; case ETHTOOL_SWOL: rc = ethtool_set_wol(dev, useraddr); break; case ETHTOOL_GMSGLVL: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_msglevel); break; case ETHTOOL_SMSGLVL: rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); if (!rc) ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); break; case ETHTOOL_GEEE: rc = ethtool_get_eee(dev, useraddr); break; case ETHTOOL_SEEE: rc = ethtool_set_eee(dev, useraddr); break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; case ETHTOOL_GLINK: rc = ethtool_get_link(dev, useraddr); break; case ETHTOOL_GEEPROM: rc = ethtool_get_eeprom(dev, useraddr); break; case ETHTOOL_SEEPROM: rc = ethtool_set_eeprom(dev, useraddr); break; case ETHTOOL_GCOALESCE: rc = ethtool_get_coalesce(dev, useraddr); break; case ETHTOOL_SCOALESCE: rc = ethtool_set_coalesce(dev, useraddr); break; case ETHTOOL_GRINGPARAM: rc = ethtool_get_ringparam(dev, useraddr); break; case ETHTOOL_SRINGPARAM: rc = ethtool_set_ringparam(dev, useraddr); break; case ETHTOOL_GPAUSEPARAM: rc = ethtool_get_pauseparam(dev, useraddr); break; case ETHTOOL_SPAUSEPARAM: rc = ethtool_set_pauseparam(dev, useraddr); break; case ETHTOOL_TEST: rc = ethtool_self_test(dev, useraddr); break; case ETHTOOL_GSTRINGS: rc = ethtool_get_strings(dev, useraddr); break; case ETHTOOL_PHYS_ID: rc = ethtool_phys_id(dev, useraddr); break; case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; case ETHTOOL_GPERMADDR: rc = ethtool_get_perm_addr(dev, useraddr); break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, __ethtool_get_flags); break; case ETHTOOL_SFLAGS: rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); break; case ETHTOOL_GPFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_priv_flags); if (!rc) ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); break; case ETHTOOL_SPFLAGS: rc = ethtool_set_value(dev, useraddr, dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_SRXFH: case ETHTOOL_SRXCLSRLDEL: case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, devlink_state); break; case ETHTOOL_RESET: rc = ethtool_reset(dev, useraddr); break; case ETHTOOL_GSSET_INFO: rc = ethtool_get_sset_info(dev, useraddr); break; case ETHTOOL_GRXFHINDIR: rc = ethtool_get_rxfh_indir(dev, useraddr); break; case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; case ETHTOOL_GRSSH: rc = ethtool_get_rxfh(dev, useraddr); break; case ETHTOOL_SRSSH: rc = ethtool_set_rxfh(dev, useraddr); break; case ETHTOOL_GFEATURES: rc = ethtool_get_features(dev, useraddr); break; case ETHTOOL_SFEATURES: rc = ethtool_set_features(dev, useraddr); break; case ETHTOOL_GTXCSUM: case ETHTOOL_GRXCSUM: case ETHTOOL_GSG: case ETHTOOL_GTSO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: rc = ethtool_get_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_STXCSUM: case ETHTOOL_SRXCSUM: case ETHTOOL_SSG: case ETHTOOL_STSO: case ETHTOOL_SGSO: case ETHTOOL_SGRO: rc = ethtool_set_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_GCHANNELS: rc = ethtool_get_channels(dev, useraddr); break; case ETHTOOL_SCHANNELS: rc = ethtool_set_channels(dev, useraddr); break; case ETHTOOL_SET_DUMP: rc = ethtool_set_dump(dev, useraddr); break; case ETHTOOL_GET_DUMP_FLAG: rc = ethtool_get_dump_flag(dev, useraddr); break; case ETHTOOL_GET_DUMP_DATA: rc = ethtool_get_dump_data(dev, useraddr); break; case ETHTOOL_GET_TS_INFO: rc = ethtool_get_ts_info(dev, useraddr); break; case ETHTOOL_GMODULEINFO: rc = ethtool_get_module_info(dev, useraddr); break; case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; case ETHTOOL_GTUNABLE: rc = ethtool_get_tunable(dev, useraddr); break; case ETHTOOL_STUNABLE: rc = ethtool_set_tunable(dev, useraddr); break; case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; case ETHTOOL_PERQUEUE: rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); break; case ETHTOOL_GLINKSETTINGS: rc = ethtool_get_link_ksettings(dev, useraddr); break; case ETHTOOL_SLINKSETTINGS: rc = ethtool_set_link_ksettings(dev, useraddr); break; case ETHTOOL_PHY_GTUNABLE: rc = get_phy_tunable(dev, useraddr); break; case ETHTOOL_PHY_STUNABLE: rc = set_phy_tunable(dev, useraddr); break; case ETHTOOL_GFECPARAM: rc = ethtool_get_fecparam(dev, useraddr); break; case ETHTOOL_SFECPARAM: rc = ethtool_set_fecparam(dev, useraddr); break; default: rc = -EOPNOTSUPP; } if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (old_features != dev->features) netdev_features_change(dev); out: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); return rc; } int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr) { struct ethtool_devlink_compat *state; u32 ethcmd; int rc; if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) return -EFAULT; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return -ENOMEM; switch (ethcmd) { case ETHTOOL_FLASHDEV: if (copy_from_user(&state->efl, useraddr, sizeof(state->efl))) { rc = -EFAULT; goto exit_free; } state->efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; break; } rtnl_lock(); rc = __dev_ethtool(net, ifr, useraddr, ethcmd, state); rtnl_unlock(); if (rc) goto exit_free; switch (ethcmd) { case ETHTOOL_FLASHDEV: if (state->devlink) rc = devlink_compat_flash_update(state->devlink, state->efl.data); break; case ETHTOOL_GDRVINFO: if (state->devlink) devlink_compat_running_version(state->devlink, state->info.fw_version, sizeof(state->info.fw_version)); if (copy_to_user(useraddr, &state->info, sizeof(state->info))) { rc = -EFAULT; goto exit_free; } break; } exit_free: if (state->devlink) devlink_put(state->devlink); kfree(state); return rc; } struct ethtool_rx_flow_key { struct flow_dissector_key_basic basic; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; struct flow_dissector_key_ip ip; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_eth_addrs eth_addrs; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct ethtool_rx_flow_match { struct flow_dissector dissector; struct ethtool_rx_flow_key key; struct ethtool_rx_flow_key mask; }; struct ethtool_rx_flow_rule * ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) { const struct ethtool_rx_flow_spec *fs = input->fs; struct ethtool_rx_flow_match *match; struct ethtool_rx_flow_rule *flow; struct flow_action_entry *act; flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) + sizeof(struct ethtool_rx_flow_match), GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); /* ethtool_rx supports only one single action per rule. */ flow->rule = flow_rule_alloc(1); if (!flow->rule) { kfree(flow); return ERR_PTR(-ENOMEM); } match = (struct ethtool_rx_flow_match *)flow->priv; flow->rule->match.dissector = &match->dissector; flow->rule->match.mask = &match->mask; flow->rule->match.key = &match->key; match->mask.basic.n_proto = htons(0xffff); switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { case ETHER_FLOW: { const struct ethhdr *ether_spec, *ether_m_spec; ether_spec = &fs->h_u.ether_spec; ether_m_spec = &fs->m_u.ether_spec; if (!is_zero_ether_addr(ether_m_spec->h_source)) { ether_addr_copy(match->key.eth_addrs.src, ether_spec->h_source); ether_addr_copy(match->mask.eth_addrs.src, ether_m_spec->h_source); } if (!is_zero_ether_addr(ether_m_spec->h_dest)) { ether_addr_copy(match->key.eth_addrs.dst, ether_spec->h_dest); ether_addr_copy(match->mask.eth_addrs.dst, ether_m_spec->h_dest); } if (ether_m_spec->h_proto) { match->key.basic.n_proto = ether_spec->h_proto; match->mask.basic.n_proto = ether_m_spec->h_proto; } } break; case TCP_V4_FLOW: case UDP_V4_FLOW: { const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; match->key.basic.n_proto = htons(ETH_P_IP); v4_spec = &fs->h_u.tcp_ip4_spec; v4_m_spec = &fs->m_u.tcp_ip4_spec; if (v4_m_spec->ip4src) { match->key.ipv4.src = v4_spec->ip4src; match->mask.ipv4.src = v4_m_spec->ip4src; } if (v4_m_spec->ip4dst) { match->key.ipv4.dst = v4_spec->ip4dst; match->mask.ipv4.dst = v4_m_spec->ip4dst; } if (v4_m_spec->ip4src || v4_m_spec->ip4dst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv4); } if (v4_m_spec->psrc) { match->key.tp.src = v4_spec->psrc; match->mask.tp.src = v4_m_spec->psrc; } if (v4_m_spec->pdst) { match->key.tp.dst = v4_spec->pdst; match->mask.tp.dst = v4_m_spec->pdst; } if (v4_m_spec->psrc || v4_m_spec->pdst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } if (v4_m_spec->tos) { match->key.ip.tos = v4_spec->tos; match->mask.ip.tos = v4_m_spec->tos; match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } } break; case TCP_V6_FLOW: case UDP_V6_FLOW: { const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec; match->key.basic.n_proto = htons(ETH_P_IPV6); v6_spec = &fs->h_u.tcp_ip6_spec; v6_m_spec = &fs->m_u.tcp_ip6_spec; if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src)) { memcpy(&match->key.ipv6.src, v6_spec->ip6src, sizeof(match->key.ipv6.src)); memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src, sizeof(match->mask.ipv6.src)); } if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { memcpy(&match->key.ipv6.dst, v6_spec->ip6dst, sizeof(match->key.ipv6.dst)); memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst, sizeof(match->mask.ipv6.dst)); } if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) || !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv6); } if (v6_m_spec->psrc) { match->key.tp.src = v6_spec->psrc; match->mask.tp.src = v6_m_spec->psrc; } if (v6_m_spec->pdst) { match->key.tp.dst = v6_spec->pdst; match->mask.tp.dst = v6_m_spec->pdst; } if (v6_m_spec->psrc || v6_m_spec->pdst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } if (v6_m_spec->tclass) { match->key.ip.tos = v6_spec->tclass; match->mask.ip.tos = v6_m_spec->tclass; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } } break; default: ethtool_rx_flow_rule_destroy(flow); return ERR_PTR(-EINVAL); } switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { case TCP_V4_FLOW: case TCP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_TCP; match->mask.basic.ip_proto = 0xff; break; case UDP_V4_FLOW: case UDP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_UDP; match->mask.basic.ip_proto = 0xff; break; } match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = offsetof(struct ethtool_rx_flow_key, basic); if (fs->flow_type & FLOW_EXT) { const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; if (ext_m_spec->vlan_etype) { match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; } if (ext_m_spec->vlan_tci) { match->key.vlan.vlan_id = ntohs(ext_h_spec->vlan_tci) & 0x0fff; match->mask.vlan.vlan_id = ntohs(ext_m_spec->vlan_tci) & 0x0fff; match->key.vlan.vlan_dei = !!(ext_h_spec->vlan_tci & htons(0x1000)); match->mask.vlan.vlan_dei = !!(ext_m_spec->vlan_tci & htons(0x1000)); match->key.vlan.vlan_priority = (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; match->mask.vlan.vlan_priority = (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; } if (ext_m_spec->vlan_etype || ext_m_spec->vlan_tci) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct ethtool_rx_flow_key, vlan); } } if (fs->flow_type & FLOW_MAC_EXT) { const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest, ETH_ALEN); memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest, ETH_ALEN); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = offsetof(struct ethtool_rx_flow_key, eth_addrs); } act = &flow->rule->action.entries[0]; switch (fs->ring_cookie) { case RX_CLS_FLOW_DISC: act->id = FLOW_ACTION_DROP; break; case RX_CLS_FLOW_WAKE: act->id = FLOW_ACTION_WAKE; break; default: act->id = FLOW_ACTION_QUEUE; if (fs->flow_type & FLOW_RSS) act->queue.ctx = input->rss_ctx; act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie); break; } return flow; } EXPORT_SYMBOL(ethtool_rx_flow_rule_create); void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow) { kfree(flow->rule); kfree(flow); } EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy);
562 562 562 562 59 250 416 286 270 5 258 6 463 18 1 49 86 544 542 561 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H #include <linux/blk-crypto.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/sched/sysctl.h> #include <linux/timekeeping.h> #include <xen/xen.h> #include "blk-crypto-internal.h" struct elevator_type; /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) extern struct dentry *blk_debugfs_root; struct blk_flush_queue { spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; unsigned long flush_data_in_flight; struct request *flush_rq; }; bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { rcu_read_lock(); if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) goto fail; /* * The code that increments the pm_only counter must ensure that the * counter is globally visible before the queue is unfrozen. */ if (blk_queue_pm_only(q) && (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) goto fail_put; rcu_read_unlock(); return true; fail_put: blk_queue_exit(q); fail: rcu_read_unlock(); return false; } static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (blk_try_enter_queue(q, false)) return 0; return __bio_queue_enter(q, bio); } static inline void blk_wait_io(struct completion *done) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) while (!wait_for_completion_io_timeout(done, timeout)) ; else wait_for_completion_io(done); } #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; /* * Merging adjacent physical pages may not work correctly under KMSAN * if their metadata pages aren't adjacent. Just disable merging. */ if (IS_ENABLED(CONFIG_KMSAN)) return false; if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) return false; if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) return false; return true; } static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { return (offset & lim->virt_boundary_mask) || ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask); } /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. */ static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { if (!lim->virt_boundary_mask) return false; return __bvec_gap_to_prev(lim, bprv, offset); } static inline bool rq_mergeable(struct request *rq) { if (blk_rq_is_passthrough(rq)) return false; if (req_op(rq) == REQ_OP_FLUSH) return false; if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; if (req_op(rq) == REQ_OP_ZONE_APPEND) return false; if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) return false; return true; } /* * There are two different ways to handle DISCARD merges: * 1) If max_discard_segments > 1, the driver treats every bio as a range and * send the bios to controller together. The ranges don't need to be * contiguous. * 2) Otherwise, the request will be normal read/write requests. The ranges * need to be contiguous. */ static inline bool blk_discard_mergable(struct request *req) { if (req_op(req) == REQ_OP_DISCARD && queue_max_discard_segments(req->q) > 1) return true; return false; } static inline unsigned int blk_rq_get_max_segments(struct request *rq) { if (req_op(rq) == REQ_OP_DISCARD) return queue_max_discard_segments(rq->q); return queue_max_segments(rq->q); } static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, enum req_op op) { if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; return q->limits.max_sectors; } #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); void bio_integrity_free(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { if (bio_integrity(bio)) return __bio_integrity_endio(bio); return true; } bool blk_integrity_merge_rq(struct request_queue *, struct request *, struct request *); bool blk_integrity_merge_bio(struct request_queue *, struct request *, struct bio *); static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { struct bio_integrity_payload *bip = bio_integrity(req->bio); struct bio_integrity_payload *bip_next = bio_integrity(next); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_payload *bip_next = bio_integrity(req->bio); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } extern const struct attribute_group blk_integrity_attr_group; #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { return true; } static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { return false; } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { return false; } static inline void blk_flush_integrity(void) { } static inline bool bio_integrity_endio(struct bio *bio) { return true; } static inline void bio_integrity_free(struct bio *bio) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); /* * Plug flush limits */ #define BLK_MAX_REQUEST_COUNT 32 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) /* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) bool blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_disable(struct request_queue *q); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); static inline bool bio_may_exceed_limits(struct bio *bio, const struct queue_limits *lim) { switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: return true; /* non-trivial splitting decisions */ default: break; } /* * All drivers must accept single-segments bios that are <= PAGE_SIZE. * This is a quick and dirty check that relies on the fact that * bi_io_vec[0] is always valid if a bio has data. The check might * lead to occasional false negatives when bios are cloned, but compared * to the performance impact of cloned bios themselves the loop below * doesn't matter anyway. */ return lim->chunk_sectors || bio->bi_vcnt != 1 || bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } struct bio *__bio_split_to_limits(struct bio *bio, const struct queue_limits *lim, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_set_default_limits(struct queue_limits *lim); int blk_dev_init(void); /* * Contribute to IO statistics IFF: * * a) it's attached to a gendisk, and * b) the queue had IO stats enabled when this request was started */ static inline bool blk_do_io_stat(struct request *rq) { return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); } void update_io_ticks(struct block_device *part, unsigned long now, bool end); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; } /* * Internal io_context interface */ struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q); #ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); #else static inline void ioc_clear_queue(struct request_queue *q) { } #endif /* CONFIG_BLK_ICQ */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, const char *page, size_t count); extern void blk_throtl_bio_endio(struct bio *bio); extern void blk_throtl_stat_add(struct request *rq, u64 time); #else static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) { return IS_ENABLED(CONFIG_BOUNCE) && q->limits.bounce == BLK_BOUNCE_HIGH && max_low_pfn >= max_pfn; } static inline struct bio *blk_queue_bounce(struct bio *bio, struct request_queue *q) { if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio))) return __blk_queue_bounce(bio, q); return bio; } #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { return -ENOTTY; } static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; } #endif /* CONFIG_BLK_DEV_ZONED */ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); void drop_partition(struct block_device *part); void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); /* * Clean up a page appropriately, where the page may be pinned, may have a * ref taken on it or neither. */ static inline void bio_release_page(struct bio *bio, struct page *page) { if (bio_flagged(bio, BIO_PAGE_PINNED)) unpin_user_page(page); } struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); void disk_block_events(struct gendisk *disk); void disk_unblock_events(struct gendisk *disk); void disk_flush_events(struct gendisk *disk, unsigned int mask); extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; extern struct attribute_group blk_trace_attr_group; blk_mode_t file_to_blk_mode(struct file *file); int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; int disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); #ifdef CONFIG_FAIL_MAKE_REQUEST bool should_fail_request(struct block_device *part, unsigned int bytes); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ /* * Optimized request reference counting. Ideally we'd make timeouts be more * clever, as that's the only reason we need references at all... But until * this happens, this is faster than using refcount_t. Also see: * * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") */ #define req_ref_zero_or_close_to_overflow(req) \ ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) static inline bool req_ref_inc_not_zero(struct request *req) { return atomic_inc_not_zero(&req->ref); } static inline bool req_ref_put_and_test(struct request *req) { WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); return atomic_dec_and_test(&req->ref); } static inline void req_ref_set(struct request *req, int value) { atomic_set(&req->ref, value); } static inline int req_ref_read(struct request *req) { return atomic_read(&req->ref); } static inline u64 blk_time_get_ns(void) { struct blk_plug *plug = current->plug; if (!plug) return ktime_get_ns(); /* * 0 could very well be a valid time, but rather than flag "this is * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; } return plug->cur_ktime; } static inline ktime_t blk_time_get(void) { return ns_to_ktime(blk_time_get_ns()); } /* * From most significant bit: * 1 bit: reserved for other usage, see below * 12 bits: original size of bio * 51 bits: issue time of bio */ #define BIO_ISSUE_RES_BITS 1 #define BIO_ISSUE_SIZE_BITS 12 #define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) #define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) #define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) #define BIO_ISSUE_SIZE_MASK \ (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) #define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) /* Reserved bit for blk-throtl */ #define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) static inline u64 __bio_issue_time(u64 time) { return time & BIO_ISSUE_TIME_MASK; } static inline u64 bio_issue_time(struct bio_issue *issue) { return __bio_issue_time(issue->value); } static inline sector_t bio_issue_size(struct bio_issue *issue) { return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); } static inline void bio_issue_init(struct bio_issue *issue, sector_t size) { size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); #endif /* BLK_INTERNAL_H */
44 44 31 44 44 5259 5259 3316 2346 2345 4356 4360 3205 1251 172 1120 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT | __GFP_NOWARN); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { char *buf; int size, used; }; #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ if (!_s->buf) { \ pr_err(fmt, ##args); \ } else { \ int ret, len = _s->size - _s->used; \ ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \ _s->used += min(ret, len); \ } \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n", dir->name, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n", dir->name, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = {}; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT | __GFP_NOWARN); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free);
23 22 23 15 15 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 // SPDX-License-Identifier: GPL-2.0 /* * Performance events ring-buffer code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/circ_buf.h> #include <linux/poll.h> #include <linux/nospec.h> #include "internal.h" static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending_irq); } /* * We need to ensure a later event_id doesn't publish a head when a former * event isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * * We only publish the head (and generate a wakeup) when the outer-most * event completes. */ static void perf_output_get_handle(struct perf_output_handle *handle) { struct perf_buffer *rb = handle->rb; preempt_disable(); /* * Avoid an explicit LOAD/STORE such that architectures with memops * can use them. */ (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { struct perf_buffer *rb = handle->rb; unsigned long head; unsigned int nest; /* * If this isn't the outermost nesting, we don't have to update * @rb->user_page->data_head. */ nest = READ_ONCE(rb->nest); if (nest > 1) { WRITE_ONCE(rb->nest, nest - 1); goto out; } again: /* * In order to avoid publishing a head value that goes backwards, * we must ensure the load of @rb->head happens after we've * incremented @rb->nest. * * Otherwise we can observe a @rb->head value before one published * by an IRQ/NMI happening between the load and the increment. */ barrier(); head = local_read(&rb->head); /* * IRQ/NMI can happen here and advance @rb->head, causing our * load above to be stale. */ /* * Since the mmap() consumer (userspace) can run on a different CPU: * * kernel user * * if (LOAD ->data_tail) { LOAD ->data_head * (A) smp_rmb() (C) * STORE $data LOAD $data * smp_wmb() (B) smp_mb() (D) * STORE ->data_head STORE ->data_tail * } * * Where A pairs with D, and B pairs with C. * * In our case (A) is a control dependency that separates the load of * the ->data_tail and the stores of $data. In case ->data_tail * indicates there is no room in the buffer to store $data we do not. * * D needs to be a full barrier since it separates the data READ * from the tail WRITE. * * For B a WMB is sufficient since it separates two WRITEs, and for C * an RMB is sufficient since it separates two READs. * * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, * otherwise an IRQ/NMI can publish a more recent head value and our * write will (temporarily) publish a stale value. */ barrier(); WRITE_ONCE(rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. * Otherwise we cannot be sure we caught the 'last' nested update. */ barrier(); if (unlikely(head != local_read(&rb->head))) { WRITE_ONCE(rb->nest, 1); goto again; } if (handle->wakeup != local_read(&rb->wakeup)) perf_output_wakeup(handle); out: preempt_enable(); } static __always_inline bool ring_buffer_has_space(unsigned long head, unsigned long tail, unsigned long data_size, unsigned int size, bool backward) { if (!backward) return CIRC_SPACE(head, tail, data_size) >= size; else return CIRC_SPACE(tail, head, data_size) >= size; } static __always_inline int __perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { struct perf_buffer *rb; unsigned long tail, offset, head; int have_lost, page_shift; struct { struct perf_event_header header; u64 id; u64 lost; } lost_event; rcu_read_lock(); /* * For inherited events we send all the output towards the parent. */ if (event->parent) event = event->parent; rb = rcu_dereference(event->rb); if (unlikely(!rb)) goto out; if (unlikely(rb->paused)) { if (rb->nr_pages) { local_inc(&rb->lost); atomic64_inc(&event->lost_samples); } goto out; } handle->rb = rb; handle->event = event; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { size += sizeof(lost_event); if (event->attr.sample_id_all) size += event->id_header_size; } perf_output_get_handle(handle); offset = local_read(&rb->head); do { head = offset; tail = READ_ONCE(rb->user_page->data_tail); if (!rb->overwrite) { if (unlikely(!ring_buffer_has_space(head, tail, perf_data_size(rb), size, backward))) goto fail; } /* * The above forms a control dependency barrier separating the * @tail load above from the data stores below. Since the @tail * load is required to compute the branch to fail below. * * A, matches D; the full memory barrier userspace SHOULD issue * after reading the data and before storing the new tail * position. * * See perf_output_put_handle(). */ if (!backward) head += size; else head -= size; } while (!local_try_cmpxchg(&rb->head, &offset, head)); if (backward) { offset = head; head = (u64)(-head); } /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. */ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); page_shift = PAGE_SHIFT + page_order(rb); handle->page = (offset >> page_shift) & (rb->nr_pages - 1); offset &= (1UL << page_shift) - 1; handle->addr = rb->data_pages[handle->page] + offset; handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); /* XXX mostly redundant; @data is already fully initializes */ perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); perf_event__output_id_sample(event, handle, data); } return 0; fail: local_inc(&rb->lost); atomic64_inc(&event->lost_samples); perf_output_put_handle(handle); out: rcu_read_unlock(); return -ENOSPC; } int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, data, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { return __output_copy(handle, buf, len); } unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len) { return __output_skip(handle, NULL, len); } void perf_output_end(struct perf_output_handle *handle) { perf_output_put_handle(handle); rcu_read_unlock(); } static void ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) { long max_size = perf_data_size(rb); if (watermark) rb->watermark = min(max_size, watermark); if (!rb->watermark) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) rb->overwrite = 0; else rb->overwrite = 1; refcount_set(&rb->refcount, 1); INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); /* * perf_output_begin() only checks rb->paused, therefore * rb->paused must be true if we have no pages for output. */ if (!rb->nr_pages) rb->paused = 1; } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) { /* * OVERWRITE is determined by perf_aux_output_end() and can't * be passed in directly. */ if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) return; handle->aux_flags |= flags; } EXPORT_SYMBOL_GPL(perf_aux_output_flag); /* * This is called before hardware starts writing to the AUX area to * obtain an output handle and make sure there's room in the buffer. * When the capture completes, call perf_aux_output_end() to commit * the recorded data to the buffer. * * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. * * Call this from pmu::start(); see the comment in perf_aux_output_end() * about its use in pmu callbacks. Both can also be called from the PMI * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct perf_buffer *rb; unsigned int nest; if (output_event->parent) output_event = output_event->parent; /* * Since this will typically be open across pmu::add/pmu::del, we * grab ring_buffer's refcount instead of holding rcu read lock * to make sure it doesn't disappear under us. */ rb = ring_buffer_get(output_event); if (!rb) return NULL; if (!rb_has_aux(rb)) goto err; /* * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), * about to get freed, so we leave immediately. * * Checking rb::aux_mmap_count and rb::refcount has to be done in * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; nest = READ_ONCE(rb->aux_nest); /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ if (WARN_ON_ONCE(nest)) goto err_put; WRITE_ONCE(rb->aux_nest, nest + 1); aux_head = rb->aux_head; handle->rb = rb; handle->event = event; handle->head = aux_head; handle->size = 0; handle->aux_flags = 0; /* * In overwrite mode, AUX data stores do not depend on aux_tail, * therefore (A) control dependency barrier does not exist. The * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { aux_tail = READ_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); /* * handle->size computation depends on aux_tail load; this forms a * control dependency barrier separating aux_tail load from aux data * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; } } return handle->rb->aux_priv; err_put: /* can't be last */ rb_free_aux(rb); err: ring_buffer_put(rb); handle->event = NULL; return NULL; } EXPORT_SYMBOL_GPL(perf_aux_output_begin); static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb) { if (rb->aux_overwrite) return false; if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); return true; } return false; } /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. * * Note: this has to be called from pmu::stop() callback, as the assumption * of the AUX buffer management code is that after pmu::stop(), the AUX * transaction must be stopped and therefore drop the AUX reference count. */ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); struct perf_buffer *rb = handle->rb; unsigned long aux_head; /* in overwrite mode, driver provides aux_head via handle */ if (rb->aux_overwrite) { handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; rb->aux_head = aux_head; } else { handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; aux_head = rb->aux_head; rb->aux_head += size; } /* * Only send RECORD_AUX if we have something useful to communicate * * Note: the OVERWRITE records by themselves are not considered * useful, as they don't communicate any *new* information, * aside from the short-lived offset, that becomes history at * the next event sched-in and therefore isn't useful. * The userspace that needs to copy out AUX data in overwrite * mode should know to use user_page::aux_head for the actual * offset. So, from now on we don't output AUX records that * have *only* OVERWRITE flag set. */ if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) handle->event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); } handle->event = NULL; WRITE_ONCE(rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); } EXPORT_SYMBOL_GPL(perf_aux_output_end); /* * Skip over a given number of bytes in the AUX buffer, due to, for example, * hardware's alignment constraints. */ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { struct perf_buffer *rb = handle->rb; if (size > handle->size) return -ENOSPC; rb->aux_head += size; WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } handle->head = rb->aux_head; handle->size -= size; return 0; } EXPORT_SYMBOL_GPL(perf_aux_output_skip); void *perf_get_aux(struct perf_output_handle *handle) { /* this is only valid between perf_aux_output_begin and *_end */ if (!handle->event) return NULL; return handle->rb->aux_priv; } EXPORT_SYMBOL_GPL(perf_get_aux); /* * Copy out AUX data from an AUX handle. */ long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to) { struct perf_buffer *rb = aux_handle->rb; unsigned long tocopy, remainder, len = 0; void *addr; from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; do { tocopy = PAGE_SIZE - offset_in_page(from); if (to > from) tocopy = min(tocopy, to - from); if (!tocopy) break; addr = rb->aux_pages[from >> PAGE_SHIFT]; addr += offset_in_page(from); remainder = perf_output_copy(handle, addr, tocopy); if (remainder) return -EFAULT; len += tocopy; from += tocopy; from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; } while (to != from); return len; } #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; if (order > MAX_PAGE_ORDER) order = MAX_PAGE_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); } while (!page && order--); if (page && order) { /* * Communicate the allocation size to the driver: * if we managed to secure a high-order allocation, * set its first page's private to this order; * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); set_page_private(page, order); } return page; } static void rb_free_aux_page(struct perf_buffer *rb, int idx) { struct page *page = virt_to_page(rb->aux_pages[idx]); ClearPagePrivate(page); page->mapping = NULL; __free_page(page); } static void __rb_free_aux(struct perf_buffer *rb) { int pg; /* * Should never happen, the last reference should be dropped from * perf_mmap_close() path, which first stops aux transactions (which * in turn are the atomic holders of aux_refcount) and then does the * last rb_free_aux(). */ WARN_ON_ONCE(in_atomic()); if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; rb->aux_priv = NULL; } if (rb->aux_nr_pages) { for (pg = 0; pg < rb->aux_nr_pages; pg++) rb_free_aux_page(rb, pg); kfree(rb->aux_pages); rb->aux_nr_pages = 0; } } int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); int ret = -ENOMEM, max_order; if (!has_aux(event)) return -EOPNOTSUPP; if (!overwrite) { /* * Watermark defaults to half the buffer, and so does the * max_order, to aid PMU drivers in double buffering. */ if (!watermark) watermark = nr_pages << (PAGE_SHIFT - 1); /* * Use aux_watermark as the basis for chunking to * help PMU drivers honor the watermark. */ max_order = get_order(watermark); } else { /* * We need to start with the max_order that fits in nr_pages, * not the other way around, hence ilog2() and not get_order. */ max_order = ilog2(nr_pages); watermark = 0; } /* * kcalloc_node() is unable to allocate buffer if the size is larger * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case. */ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER) return -ENOMEM; rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) return -ENOMEM; rb->free_aux = event->pmu->free_aux; for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { struct page *page; int last, order; order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); page = rb_alloc_aux_page(node, order); if (!page) goto out; for (last = rb->aux_nr_pages + (1 << page_private(page)); last > rb->aux_nr_pages; rb->aux_nr_pages++) rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } /* * In overwrite mode, PMUs that don't support SG may not handle more * than one contiguous allocation, since they rely on PMI to do double * buffering. In this case, the entire buffer has to be one contiguous * chunk. */ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && overwrite) { struct page *page = virt_to_page(rb->aux_pages[0]); if (page_private(page) != max_order) goto out; } rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; ret = 0; /* * aux_pages (and pmu driver's private data, aux_priv) will be * referenced in both producer's and consumer's contexts, thus * we keep a refcount here to make sure either of the two can * reference them safely. */ refcount_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; out: if (!ret) rb->aux_pgoff = pgoff; else __rb_free_aux(rb); return ret; } void rb_free_aux(struct perf_buffer *rb) { if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); } #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; if (pgoff == 0) return virt_to_page(rb->user_page); return virt_to_page(rb->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) { struct page *page; int node; node = (cpu == -1) ? cpu : cpu_to_node(cpu); page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) return NULL; return page_address(page); } static void perf_mmap_free_page(void *addr) { struct page *page = virt_to_page(addr); page->mapping = NULL; __free_page(page); } struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *rb; unsigned long size; int i, node; size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER) goto fail; node = (cpu == -1) ? cpu : cpu_to_node(cpu); rb = kzalloc_node(size, GFP_KERNEL, node); if (!rb) goto fail; rb->user_page = perf_mmap_alloc_page(cpu); if (!rb->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { rb->data_pages[i] = perf_mmap_alloc_page(cpu); if (!rb->data_pages[i]) goto fail_data_pages; } rb->nr_pages = nr_pages; ring_buffer_init(rb, watermark, flags); return rb; fail_data_pages: for (i--; i >= 0; i--) perf_mmap_free_page(rb->data_pages[i]); perf_mmap_free_page(rb->user_page); fail_user_page: kfree(rb); fail: return NULL; } void rb_free(struct perf_buffer *rb) { int i; perf_mmap_free_page(rb->user_page); for (i = 0; i < rb->nr_pages; i++) perf_mmap_free_page(rb->data_pages[i]); kfree(rb); } #else static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) return NULL; return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) { struct page *page = vmalloc_to_page(addr); page->mapping = NULL; } static void rb_free_work(struct work_struct *work) { struct perf_buffer *rb; void *base; int i, nr; rb = container_of(work, struct perf_buffer, work); nr = data_page_nr(rb); base = rb->user_page; /* The '<=' counts in the user page. */ for (i = 0; i <= nr; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); kfree(rb); } void rb_free(struct perf_buffer *rb) { schedule_work(&rb->work); } struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *rb; unsigned long size; void *all_buf; int node; size = sizeof(struct perf_buffer); size += sizeof(void *); node = (cpu == -1) ? cpu : cpu_to_node(cpu); rb = kzalloc_node(size, GFP_KERNEL, node); if (!rb) goto fail; INIT_WORK(&rb->work, rb_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; if (nr_pages) { rb->nr_pages = 1; rb->page_order = ilog2(nr_pages); } ring_buffer_init(rb, watermark, flags); return rb; fail_all_buf: kfree(rb); fail: return NULL; } #endif struct page * perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (rb->aux_nr_pages) { /* above AUX space */ if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) return NULL; /* AUX space */ if (pgoff >= rb->aux_pgoff) { int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); return virt_to_page(rb->aux_pages[aux_pgoff]); } } return __perf_mmap_to_page(rb, pgoff); }
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 // SPDX-License-Identifier: GPL-2.0-only /* * Crypto user configuration API. * * Copyright (C) 2011 secunet Security Networks AG * Copyright (C) 2011 Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/crypto.h> #include <linux/cryptouser.h> #include <linux/sched.h> #include <linux/security.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <crypto/internal/skcipher.h> #include <crypto/internal/rng.h> #include <crypto/akcipher.h> #include <crypto/kpp.h> #include <crypto/internal/cryptouser.h> #include "internal.h" #define null_terminated(x) (strnlen(x, sizeof(x)) < sizeof(x)) static DEFINE_MUTEX(crypto_cfg_mutex); struct crypto_dump_info { struct sk_buff *in_skb; struct sk_buff *out_skb; u32 nlmsg_seq; u16 nlmsg_flags; }; struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact) { struct crypto_alg *q, *alg = NULL; down_read(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { int match = 0; if (crypto_is_larval(q)) continue; if ((q->cra_flags ^ p->cru_type) & p->cru_mask) continue; if (strlen(p->cru_driver_name)) match = !strcmp(q->cra_driver_name, p->cru_driver_name); else if (!exact) match = !strcmp(q->cra_name, p->cru_name); if (!match) continue; if (unlikely(!crypto_mod_get(q))) continue; alg = q; break; } up_read(&crypto_alg_sem); return alg; } static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_cipher rcipher; memset(&rcipher, 0, sizeof(rcipher)); strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); rcipher.blocksize = alg->cra_blocksize; rcipher.min_keysize = alg->cra_cipher.cia_min_keysize; rcipher.max_keysize = alg->cra_cipher.cia_max_keysize; return nla_put(skb, CRYPTOCFGA_REPORT_CIPHER, sizeof(rcipher), &rcipher); } static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_comp rcomp; memset(&rcomp, 0, sizeof(rcomp)); strscpy(rcomp.type, "compression", sizeof(rcomp.type)); return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rcomp), &rcomp); } static int crypto_report_one(struct crypto_alg *alg, struct crypto_user_alg *ualg, struct sk_buff *skb) { memset(ualg, 0, sizeof(*ualg)); strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); strscpy(ualg->cru_driver_name, alg->cra_driver_name, sizeof(ualg->cru_driver_name)); strscpy(ualg->cru_module_name, module_name(alg->cra_module), sizeof(ualg->cru_module_name)); ualg->cru_type = 0; ualg->cru_mask = 0; ualg->cru_flags = alg->cra_flags; ualg->cru_refcnt = refcount_read(&alg->cra_refcnt); if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority)) goto nla_put_failure; if (alg->cra_flags & CRYPTO_ALG_LARVAL) { struct crypto_report_larval rl; memset(&rl, 0, sizeof(rl)); strscpy(rl.type, "larval", sizeof(rl.type)); if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(rl), &rl)) goto nla_put_failure; goto out; } if (alg->cra_type && alg->cra_type->report) { if (alg->cra_type->report(skb, alg)) goto nla_put_failure; goto out; } switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) { case CRYPTO_ALG_TYPE_CIPHER: if (crypto_report_cipher(skb, alg)) goto nla_put_failure; break; case CRYPTO_ALG_TYPE_COMPRESS: if (crypto_report_comp(skb, alg)) goto nla_put_failure; break; } out: return 0; nla_put_failure: return -EMSGSIZE; } static int crypto_report_alg(struct crypto_alg *alg, struct crypto_dump_info *info) { struct sk_buff *in_skb = info->in_skb; struct sk_buff *skb = info->out_skb; struct nlmsghdr *nlh; struct crypto_user_alg *ualg; int err = 0; nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, info->nlmsg_seq, CRYPTO_MSG_GETALG, sizeof(*ualg), info->nlmsg_flags); if (!nlh) { err = -EMSGSIZE; goto out; } ualg = nlmsg_data(nlh); err = crypto_report_one(alg, ualg, skb); if (err) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); out: return err; } static int crypto_report(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs) { struct net *net = sock_net(in_skb->sk); struct crypto_user_alg *p = nlmsg_data(in_nlh); struct crypto_alg *alg; struct sk_buff *skb; struct crypto_dump_info info; int err; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; alg = crypto_alg_match(p, 0); if (!alg) return -ENOENT; err = -ENOMEM; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) goto drop_alg; info.in_skb = in_skb; info.out_skb = skb; info.nlmsg_seq = in_nlh->nlmsg_seq; info.nlmsg_flags = 0; err = crypto_report_alg(alg, &info); drop_alg: crypto_mod_put(alg); if (err) { kfree_skb(skb); return err; } return nlmsg_unicast(net->crypto_nlsk, skb, NETLINK_CB(in_skb).portid); } static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb) { const size_t start_pos = cb->args[0]; size_t pos = 0; struct crypto_dump_info info; struct crypto_alg *alg; int res; info.in_skb = cb->skb; info.out_skb = skb; info.nlmsg_seq = cb->nlh->nlmsg_seq; info.nlmsg_flags = NLM_F_MULTI; down_read(&crypto_alg_sem); list_for_each_entry(alg, &crypto_alg_list, cra_list) { if (pos >= start_pos) { res = crypto_report_alg(alg, &info); if (res == -EMSGSIZE) break; if (res) goto out; } pos++; } cb->args[0] = pos; res = skb->len; out: up_read(&crypto_alg_sem); return res; } static int crypto_dump_report_done(struct netlink_callback *cb) { return 0; } static int crypto_update_alg(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { struct crypto_alg *alg; struct crypto_user_alg *p = nlmsg_data(nlh); struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL]; LIST_HEAD(list); if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; if (priority && !strlen(p->cru_driver_name)) return -EINVAL; alg = crypto_alg_match(p, 1); if (!alg) return -ENOENT; down_write(&crypto_alg_sem); crypto_remove_spawns(alg, &list, NULL); if (priority) alg->cra_priority = nla_get_u32(priority); up_write(&crypto_alg_sem); crypto_mod_put(alg); crypto_remove_final(&list); return 0; } static int crypto_del_alg(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { struct crypto_alg *alg; struct crypto_user_alg *p = nlmsg_data(nlh); int err; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; alg = crypto_alg_match(p, 1); if (!alg) return -ENOENT; /* We can not unregister core algorithms such as aes-generic. * We would loose the reference in the crypto_alg_list to this algorithm * if we try to unregister. Unregistering such an algorithm without * removing the module is not possible, so we restrict to crypto * instances that are build from templates. */ err = -EINVAL; if (!(alg->cra_flags & CRYPTO_ALG_INSTANCE)) goto drop_alg; err = -EBUSY; if (refcount_read(&alg->cra_refcnt) > 2) goto drop_alg; crypto_unregister_instance((struct crypto_instance *)alg); err = 0; drop_alg: crypto_mod_put(alg); return err; } static int crypto_add_alg(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { int exact = 0; const char *name; struct crypto_alg *alg; struct crypto_user_alg *p = nlmsg_data(nlh); struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL]; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; if (strlen(p->cru_driver_name)) exact = 1; if (priority && !exact) return -EINVAL; alg = crypto_alg_match(p, exact); if (alg) { crypto_mod_put(alg); return -EEXIST; } if (strlen(p->cru_driver_name)) name = p->cru_driver_name; else name = p->cru_name; alg = crypto_alg_mod_lookup(name, p->cru_type, p->cru_mask); if (IS_ERR(alg)) return PTR_ERR(alg); down_write(&crypto_alg_sem); if (priority) alg->cra_priority = nla_get_u32(priority); up_write(&crypto_alg_sem); crypto_mod_put(alg); return 0; } static int crypto_del_rng(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; return crypto_del_default_rng(); } #define MSGSIZE(type) sizeof(struct type) static const int crypto_msg_min[CRYPTO_NR_MSGTYPES] = { [CRYPTO_MSG_NEWALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_DELALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_UPDATEALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = 0, [CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), }; static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = { [CRYPTOCFGA_PRIORITY_VAL] = { .type = NLA_U32}, }; #undef MSGSIZE static const struct crypto_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); } crypto_dispatch[CRYPTO_NR_MSGTYPES] = { [CRYPTO_MSG_NEWALG - CRYPTO_MSG_BASE] = { .doit = crypto_add_alg}, [CRYPTO_MSG_DELALG - CRYPTO_MSG_BASE] = { .doit = crypto_del_alg}, [CRYPTO_MSG_UPDATEALG - CRYPTO_MSG_BASE] = { .doit = crypto_update_alg}, [CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE] = { .doit = crypto_report, .dump = crypto_dump_report, .done = crypto_dump_report_done}, [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = { .doit = crypto_del_rng }, [CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = { .doit = crypto_reportstat}, }; static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[CRYPTOCFGA_MAX+1]; const struct crypto_link *link; int type, err; type = nlh->nlmsg_type; if (type > CRYPTO_MSG_MAX) return -EINVAL; type -= CRYPTO_MSG_BASE; link = &crypto_dispatch[type]; if ((type == (CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE) && (nlh->nlmsg_flags & NLM_F_DUMP))) { struct crypto_alg *alg; unsigned long dump_alloc = 0; if (link->dump == NULL) return -EINVAL; down_read(&crypto_alg_sem); list_for_each_entry(alg, &crypto_alg_list, cra_list) dump_alloc += CRYPTO_REPORT_MAXSIZE; up_read(&crypto_alg_sem); { struct netlink_dump_control c = { .dump = link->dump, .done = link->done, .min_dump_alloc = min(dump_alloc, 65535UL), }; err = netlink_dump_start(net->crypto_nlsk, skb, nlh, &c); } return err; } err = nlmsg_parse_deprecated(nlh, crypto_msg_min[type], attrs, CRYPTOCFGA_MAX, crypto_policy, extack); if (err < 0) return err; if (link->doit == NULL) return -EINVAL; return link->doit(skb, nlh, attrs); } static void crypto_netlink_rcv(struct sk_buff *skb) { mutex_lock(&crypto_cfg_mutex); netlink_rcv_skb(skb, &crypto_user_rcv_msg); mutex_unlock(&crypto_cfg_mutex); } static int __net_init crypto_netlink_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = crypto_netlink_rcv, }; net->crypto_nlsk = netlink_kernel_create(net, NETLINK_CRYPTO, &cfg); return net->crypto_nlsk == NULL ? -ENOMEM : 0; } static void __net_exit crypto_netlink_exit(struct net *net) { netlink_kernel_release(net->crypto_nlsk); net->crypto_nlsk = NULL; } static struct pernet_operations crypto_netlink_net_ops = { .init = crypto_netlink_init, .exit = crypto_netlink_exit, }; static int __init crypto_user_init(void) { return register_pernet_subsys(&crypto_netlink_net_ops); } static void __exit crypto_user_exit(void) { unregister_pernet_subsys(&crypto_netlink_net_ops); } module_init(crypto_user_init); module_exit(crypto_user_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("Crypto userspace configuration API"); MODULE_ALIAS("net-pf-16-proto-21");
1 1 1 1 1 1 1 1 451 450 5 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN * * This implementation does not provide ISO-TP specific return values to the * userspace. * * - RX path timeout of data reception leads to -ETIMEDOUT * - RX path SN mismatch leads to -EILSEQ * - RX path data reception with wrong padding leads to -EBADMSG * - TX path flowcontrol reception timeout leads to -ECOMM * - TX path flowcontrol reception overflow leads to -EMSGSIZE * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG * - when a transfer (tx) is on the run the next write() blocks until it's done * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/wait.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS("can-proto-6"); #define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp) #define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) /* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can * take full 32 bit values (4 Gbyte). We would need some good concept to handle * this between user space and kernel space. For now set the static buffer to * something about 8 kbyte to be able to test this new functionality. */ #define DEFAULT_MAX_PDU_SIZE 8300 /* maximum PDU size before ISO 15765-2:2016 extension was 4095 */ #define MAX_12BIT_PDU_SIZE 4095 /* limit the isotp pdu size from the optional module parameter to 1MByte */ #define MAX_PDU_SIZE (1025 * 1024U) static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE; module_param(max_pdu_size, uint, 0444); MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default " __stringify(DEFAULT_MAX_PDU_SIZE) ")"); /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ #define N_PCI_FF 0x10 /* first frame */ #define N_PCI_CF 0x20 /* consecutive frame */ #define N_PCI_FC 0x30 /* flow control */ #define N_PCI_SZ 1 /* size of the PCI byte #1 */ #define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ #define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ #define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ #define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) #define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ #define ISOTP_FC_WT 1 /* wait */ #define ISOTP_FC_OVFLW 2 /* overflow */ #define ISOTP_FC_TIMEOUT 1 /* 1 sec */ #define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */ enum { ISOTP_IDLE = 0, ISOTP_WAIT_FIRST_FC, ISOTP_WAIT_FC, ISOTP_WAIT_DATA, ISOTP_SENDING, ISOTP_SHUTDOWN, }; struct tpcon { u8 *buf; unsigned int buflen; unsigned int len; unsigned int idx; u32 state; u8 bs; u8 sn; u8 ll_dl; u8 sbuf[DEFAULT_MAX_PDU_SIZE]; }; struct isotp_sock { struct sock sk; int bound; int ifindex; canid_t txid; canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); static DEFINE_SPINLOCK(isotp_notifier_lock); static struct isotp_sock *isotp_busy_notifier; static inline struct isotp_sock *isotp_sk(const struct sock *sk) { return (struct isotp_sock *)sk; } static u32 isotp_bc_flags(struct isotp_sock *so) { return so->opt.flags & ISOTP_ALL_BC_FLAGS; } static bool isotp_register_rxid(struct isotp_sock *so) { /* no broadcast modes => register rx_id for FC frame reception */ return (isotp_bc_flags(so) == 0); } static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, rxtimer); struct sock *sk = &so->sk; if (so->rx.state == ISOTP_WAIT_DATA) { /* we did not get new data frames in time */ /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; } return HRTIMER_NORESTART; } static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); if (!nskb) return 1; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } can_skb_reserve(nskb); can_skb_prv(nskb)->ifindex = dev->ifindex; can_skb_prv(nskb)->skbcnt = 0; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; skb_put_zero(nskb, so->ll.mtu); /* create & send flow control reply */ ncf->can_id = so->txid; if (so->opt.flags & CAN_ISOTP_TX_PADDING) { memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); ncf->len = CAN_MAX_DLEN; } else { ncf->len = ae + FC_CONTENT_SZ; } ncf->data[ae] = N_PCI_FC | flowstatus; ncf->data[ae + 1] = so->rxfc.bs; ncf->data[ae + 2] = so->rxfc.stmin; if (ae) ncf->data[0] = so->opt.ext_address; ncf->flags = so->ll.tx_flags; can_send_ret = can_send(nskb, 1); if (can_send_ret) pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); dev_put(dev); /* reset blocksize counter */ so->rx.bs = 0; /* reset last CF frame rx timestamp for rx stmin enforcement */ so->lastrxcf_tstamp = ktime_set(0, 0); /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) { struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static u8 padlen(u8 datalen) { static const u8 plen[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ 12, 12, 12, 12, /* 9 - 12 */ 16, 16, 16, 16, /* 13 - 16 */ 20, 20, 20, 20, /* 17 - 20 */ 24, 24, 24, 24, /* 21 - 24 */ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */ }; if (datalen > 48) return 64; return plen[datalen]; } /* check for length optimization and return 1/true when the check fails */ static int check_optimized(struct canfd_frame *cf, int start_index) { /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the * padding would start at this point. E.g. if the padding would * start at cf.data[7] cf->len has to be 7 to be optimal. * Note: The data[] index starts with zero. */ if (cf->len <= CAN_MAX_DLEN) return (cf->len != start_index); /* This relation is also valid in the non-linear DLC range, where * we need to take care of the minimal next possible CAN_DL. * The correct check would be (padlen(cf->len) != padlen(start_index)). * But as cf->len can only take discrete values from 12, .., 64 at this * point the padlen(cf->len) is always equal to cf->len. */ return (cf->len != padlen(start_index)); } /* check padding and return 1/true when the check fails */ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, int start_index, u8 content) { int i; /* no RX_PADDING value => check length of optimized frame length */ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) return check_optimized(cf, start_index); /* no valid test against empty value => ignore frame */ return 1; } /* check datalength of correctly padded CAN frame */ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && cf->len != padlen(cf->len)) return 1; /* check padding content */ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { for (i = start_index; i < cf->len; i++) if (cf->data[i] != content) return 1; } return 0; } static void isotp_send_cframe(struct isotp_sock *so); static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) { struct sock *sk = &so->sk; if (so->tx.state != ISOTP_WAIT_FC && so->tx.state != ISOTP_WAIT_FIRST_FC) return 0; hrtimer_cancel(&so->txtimer); if ((cf->len < ae + FC_CONTENT_SZ) || ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return 1; } /* get static/dynamic communication params from first/every FC frame */ if (so->tx.state == ISOTP_WAIT_FIRST_FC || so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; /* fix wrong STmin values according spec */ if (so->txfc.stmin > 0x7F && (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) so->txfc.stmin = 0x7F; so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, so->force_tx_stmin); else if (so->txfc.stmin < 0x80) so->tx_gap = ktime_add_ns(so->tx_gap, so->txfc.stmin * 1000000); else so->tx_gap = ktime_add_ns(so->tx_gap, (so->txfc.stmin - 0xF0) * 100000); so->tx.state = ISOTP_WAIT_FC; } switch (cf->data[ae] & 0x0F) { case ISOTP_FC_CTS: so->tx.bs = 0; so->tx.state = ISOTP_SENDING; /* send CF frame and enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); break; case ISOTP_FC_WT: /* start timer to wait for next FC frame */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); break; case ISOTP_FC_OVFLW: /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); fallthrough; default: /* stop this tx job */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); } return 0; } static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, struct sk_buff *skb, int len) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; if (!len || len > cf->len - pcilen) return 1; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, len), &cf->data[pcilen], len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) { struct isotp_sock *so = isotp_sk(sk); int i; int off; int ff_pci_sz; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; /* get the used sender LL_DL from the (first) CAN frame data length */ so->rx.ll_dl = padlen(cf->len); /* the first frame has to use the entire frame up to LL_DL length */ if (cf->len != so->rx.ll_dl) return 1; /* get the FF_DL */ so->rx.len = (cf->data[ae] & 0x0F) << 8; so->rx.len += cf->data[ae + 1]; /* Check for FF_DL escape sequence supporting 32 bit PDU length */ if (so->rx.len) { ff_pci_sz = FF_PCI_SZ12; } else { /* FF_DL = 0 => get real length from next 4 bytes */ so->rx.len = cf->data[ae + 2] << 24; so->rx.len += cf->data[ae + 3] << 16; so->rx.len += cf->data[ae + 4] << 8; so->rx.len += cf->data[ae + 5]; ff_pci_sz = FF_PCI_SZ32; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) return 1; /* PDU size > default => try max_pdu_size */ if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC); if (newbuf) { so->rx.buf = newbuf; so->rx.buflen = max_pdu_size; } } if (so->rx.len > so->rx.buflen) { /* send FC frame with overflow status */ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); return 1; } /* copy the first received data bytes */ so->rx.idx = 0; for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) so->rx.buf[so->rx.idx++] = cf->data[i]; /* initial setup for this pdu reception */ so->rx.sn = 1; so->rx.state = ISOTP_WAIT_DATA; /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* send our first FC frame */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, struct sk_buff *skb) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; int i; if (so->rx.state != ISOTP_WAIT_DATA) return 0; /* drop if timestamp gap is less than force_rx_stmin nano secs */ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < so->force_rx_stmin) return 0; so->lastrxcf_tstamp = skb->tstamp; } hrtimer_cancel(&so->rxtimer); /* CFs are never longer than the FF */ if (cf->len > so->rx.ll_dl) return 1; /* CFs have usually the LL_DL length */ if (cf->len < so->rx.ll_dl) { /* this is only allowed for the last CF */ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) return 1; } if ((cf->data[ae] & 0x0F) != so->rx.sn) { /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; return 1; } so->rx.sn++; so->rx.sn %= 16; for (i = ae + N_PCI_SZ; i < cf->len; i++) { so->rx.buf[so->rx.idx++] = cf->data[i]; if (so->rx.idx >= so->rx.len) break; } if (so->rx.idx >= so->rx.len) { /* we are done */ so->rx.state = ISOTP_IDLE; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, i + 1, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(so->rx.len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, so->rx.len), so->rx.buf, so->rx.len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* we reached the specified blocksize so->rxfc.bs */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static void isotp_rcv(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; u8 n_pci_type, sf_dl; /* Strictly receive only frames with the configured MTU size * => clear separation of CAN2.0 / CAN FD transport channels */ if (skb->len != so->ll.mtu) return; cf = (struct canfd_frame *)skb->data; /* if enabled: check reception of my configured extended address */ if (ae && cf->data[0] != so->opt.rx_ext_address) return; n_pci_type = cf->data[ae] & 0xF0; /* Make sure the state changes and data structures stay consistent at * CAN frame reception time. This locking is not needed in real world * use cases but the inconsistency can be triggered with syzkaller. */ spin_lock(&so->rx_lock); if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) goto out_unlock; } switch (n_pci_type) { case N_PCI_FC: /* tx path: flow control frame containing the FC parameters */ isotp_rcv_fc(so, cf, ae); break; case N_PCI_SF: /* rx path: single frame * * As we do not have a rx.ll_dl configuration, we can only test * if the CAN frames payload length matches the LL_DL == 8 * requirements - no matter if it's CAN 2.0 or CAN FD */ /* get the SF_DL from the N_PCI byte */ sf_dl = cf->data[ae] & 0x0F; if (cf->len <= CAN_MAX_DLEN) { isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); } else { if (can_is_canfd_skb(skb)) { /* We have a CAN FD frame and CAN_DL is greater than 8: * Only frames with the SF_DL == 0 ESC value are valid. * * If so take care of the increased SF PCI size * (SF_PCI_SZ8) to point to the message content behind * the extended SF PCI info and get the real SF_DL * length value from the formerly first data byte. */ if (sf_dl == 0) isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, cf->data[SF_PCI_SZ4 + ae]); } } break; case N_PCI_FF: /* rx path: first frame */ isotp_rcv_ff(sk, cf, ae); break; case N_PCI_CF: /* rx path: consecutive frame */ isotp_rcv_cf(sk, cf, ae, skb); break; } out_unlock: spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, int ae, int off) { int pcilen = N_PCI_SZ + ae + off; int space = so->tx.ll_dl - pcilen; int num = min_t(int, so->tx.len - so->tx.idx, space); int i; cf->can_id = so->txid; cf->len = num + pcilen; if (num < space) { if (so->opt.flags & CAN_ISOTP_TX_PADDING) { /* user requested padding */ cf->len = padlen(cf->len); memset(cf->data, so->opt.txpad_content, cf->len); } else if (cf->len > CAN_MAX_DLEN) { /* mandatory padding for CAN FD frames */ cf->len = padlen(cf->len); memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, cf->len); } } for (i = 0; i < num; i++) cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; if (ae) cf->data[0] = so->opt.ext_address; } static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) return; skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); if (!skb) { dev_put(dev); return; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* create consecutive frame */ isotp_fill_dataframe(cf, so, ae, 0); /* place consecutive frame N_PCI in appropriate index */ cf->data[ae] = N_PCI_CF | so->tx.sn++; so->tx.sn %= 16; so->tx.bs++; cf->flags = so->ll.tx_flags; skb->dev = dev; can_skb_set_owner(skb, sk); /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); /* set consecutive frame echo tag */ so->cfecho = *(u32 *)cf->data; /* send frame with local echo enabled */ can_send_ret = can_send(skb, 1); if (can_send_ret) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); if (can_send_ret == -ENOBUFS) pr_notice_once("can-isotp: tx queue is full\n"); } dev_put(dev); } static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { int i; int ff_pci_sz; cf->can_id = so->txid; cf->len = so->tx.ll_dl; if (ae) cf->data[0] = so->opt.ext_address; /* create N_PCI bytes with 12/32 bit FF_DL data length */ if (so->tx.len > MAX_12BIT_PDU_SIZE) { /* use 32 bit FF_DL notation */ cf->data[ae] = N_PCI_FF; cf->data[ae + 1] = 0; cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ32; } else { /* use 12 bit FF_DL notation */ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ12; } /* add first data bytes depending on ae */ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf = (struct canfd_frame *)skb->data; /* only handle my own local echo CF/SF skb's (no FF!) */ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) return; /* cancel local echo timeout */ hrtimer_cancel(&so->txtimer); /* local echo skb with consecutive frame has been consumed */ so->cfecho = 0; if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return; } if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { /* stop and wait for FC with timeout */ so->tx.state = ISOTP_WAIT_FC; hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return; } /* no gap between data frames needed => use burst mode */ if (!so->tx_gap) { /* enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); return; } /* start timer to send next consecutive frame with correct delay */ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; /* don't handle timeouts in IDLE or SHUTDOWN state */ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN) return HRTIMER_NORESTART; /* we did not get any flow control or echo frame in time */ /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return HRTIMER_NORESTART; } static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txfrtimer); /* start echo timeout handling and cover below protocol error */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); /* cfecho should be consumed by isotp_rcv_echo() here */ if (so->tx.state == ISOTP_SENDING && !so->cfecho) isotp_send_cframe(so); return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT; int off; int err; if (!so->bound || so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { /* we do not support multiple buffers - for now */ if (msg->msg_flags & MSG_DONTWAIT) return -EAGAIN; if (so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; } /* PDU size > default => try max_pdu_size */ if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL); if (newbuf) { so->tx.buf = newbuf; so->tx.buflen = max_pdu_size; } } if (!size || size > so->tx.buflen) { err = -EINVAL; goto err_out_drop; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; goto err_out_drop; } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) goto err_out_drop; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { err = -ENXIO; goto err_out_drop; } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); goto err_out_drop; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; so->tx.len = size; so->tx.idx = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho); /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { /* The message size generally fits into a SingleFrame - good. * * SF_DL ESC offset optimization: * * When TX_DL is greater 8 but the message would still fit * into a 8 byte CAN frame, we can omit the offset. * This prevents a protocol caused length extension from * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. */ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) off = 0; isotp_fill_dataframe(cf, so, ae, off); /* place single frame N_PCI w/o length in appropriate index */ cf->data[ae] = N_PCI_SF; /* place SF_DL size value depending on the SF_DL ESC offset */ if (off) cf->data[SF_PCI_SZ4 + ae] = size; else cf->data[ae] |= size; /* set CF echo tag for isotp_rcv_echo() (SF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* send first frame */ isotp_create_fframe(cf, so, ae); if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { /* set timer for FC-less operation (STmin = 0) */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_set(0, so->force_tx_stmin); else so->tx_gap = ktime_set(0, so->frame_txtime); /* disable wait for FCs due to activated block size */ so->txfc.bs = 0; /* set CF echo tag for isotp_rcv_echo() (CF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* standard flow control check */ so->tx.state = ISOTP_WAIT_FIRST_FC; /* start timeout for FC */ hrtimer_sec = ISOTP_FC_TIMEOUT; /* no CF echo tag for isotp_rcv_echo() (FF-mode) */ so->cfecho = 0; } } hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), HRTIMER_MODE_REL_SOFT); /* send the first or only CAN frame */ cf->flags = so->ll.tx_flags; skb->dev = dev; skb->sk = sk; err = can_send(skb, 1); dev_put(dev); if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); /* no transmission -> no timeout monitoring */ hrtimer_cancel(&so->txtimer); /* reset consecutive frame echo tag */ so->cfecho = 0; goto err_out_drop; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; err = sock_error(sk); if (err) return err; } return size; err_event_drop: /* got signal: force tx state machine to be idle */ so->tx.state = ISOTP_IDLE; hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); err_out_drop: /* drop this PDU and unlock a potential wait queue */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT)) return -EINVAL; if (!so->bound) return -EADDRNOTAVAIL; skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) goto out_err; sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(ISOTP_MIN_NAMELEN); msg->msg_namelen = ISOTP_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* set length of return value */ ret = (flags & MSG_TRUNC) ? skb->len : size; out_err: skb_free_datagram(sk, skb); return ret; } static int isotp_release(struct socket *sock) { struct sock *sk = sock->sk; struct isotp_sock *so; struct net *net; if (!sk) return 0; so = isotp_sk(sk); net = sock_net(sk); /* wait for complete transmission of current pdu */ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 && cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE) ; /* force state machines to be idle also when a signal occurred */ so->tx.state = ISOTP_SHUTDOWN; so->rx.state = ISOTP_IDLE; spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&isotp_notifier_lock); } list_del(&so->notifier); spin_unlock(&isotp_notifier_lock); lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { if (isotp_register_rxid(so)) can_rx_unregister(net, dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(net, dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); dev_put(dev); synchronize_rcu(); } } } hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); so->ifindex = 0; so->bound = 0; if (so->rx.buf != so->rx.sbuf) kfree(so->rx.buf); if (so->tx.buf != so->tx.sbuf) kfree(so->tx.buf); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct net *net = sock_net(sk); int ifindex; struct net_device *dev; canid_t tx_id = addr->can_addr.tp.tx_id; canid_t rx_id = addr->can_addr.tp.rx_id; int err = 0; int notify_enetdown = 0; if (len < ISOTP_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* sanitize tx CAN identifier */ if (tx_id & CAN_EFF_FLAG) tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else tx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (tx_id != addr->can_addr.tp.tx_id) return -EINVAL; /* sanitize rx CAN identifier (if needed) */ if (isotp_register_rxid(so)) { if (rx_id & CAN_EFF_FLAG) rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else rx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (rx_id != addr->can_addr.tp.rx_id) return -EINVAL; } if (!addr->can_ifindex) return -ENODEV; lock_sock(sk); if (so->bound) { err = -EINVAL; goto out; } /* ensure different CAN IDs when the rx_id is to be registered */ if (isotp_register_rxid(so) && rx_id == tx_id) { err = -EADDRNOTAVAIL; goto out; } dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { dev_put(dev); err = -ENODEV; goto out; } if (dev->mtu < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = dev->ifindex; if (isotp_register_rxid(so)) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); /* no consecutive frame echo skb in flight */ so->cfecho = 0; /* register for echo skb's */ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), isotp_rcv_echo, sk, "isotpe", sk); dev_put(dev); /* switch to new settings */ so->ifindex = ifindex; so->rxid = rx_id; so->txid = tx_id; so->bound = 1; out: release_sock(sk); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, ISOTP_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = so->rxid; addr->can_addr.tp.tx_id = so->txid; return ISOTP_MIN_NAMELEN; } static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (so->bound) return -EISCONN; switch (optname) { case CAN_ISOTP_OPTS: if (optlen != sizeof(struct can_isotp_options)) return -EINVAL; if (copy_from_sockptr(&so->opt, optval, optlen)) return -EFAULT; /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; /* these broadcast flags are not allowed together */ if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { /* CAN_ISOTP_SF_BROADCAST is prioritized */ so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; /* give user feedback on wrong config attempt */ ret = -EINVAL; } /* check for frame_txtime changes (0 => no changes) */ if (so->opt.frame_txtime) { if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) so->frame_txtime = 0; else so->frame_txtime = so->opt.frame_txtime; } break; case CAN_ISOTP_RECV_FC: if (optlen != sizeof(struct can_isotp_fc_options)) return -EINVAL; if (copy_from_sockptr(&so->rxfc, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_TX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_RX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_LL_OPTS: if (optlen == sizeof(struct can_isotp_ll_options)) { struct can_isotp_ll_options ll; if (copy_from_sockptr(&ll, optval, optlen)) return -EFAULT; /* check for correct ISO 11898-1 DLC data length */ if (ll.tx_dl != padlen(ll.tx_dl)) return -EINVAL; if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) return -EINVAL; if (ll.mtu == CAN_MTU && (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0)) return -EINVAL; memcpy(&so->ll, &ll, sizeof(ll)); /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = ll.tx_dl; } else { return -EINVAL; } break; default: ret = -ENOPROTOOPT; } return ret; } static int isotp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int ret; if (level != SOL_CAN_ISOTP) return -EINVAL; lock_sock(sk); ret = isotp_setsockopt_locked(sock, level, optname, optval, optlen); release_sock(sk); return ret; } static int isotp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int len; void *val; if (level != SOL_CAN_ISOTP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_ISOTP_OPTS: len = min_t(int, len, sizeof(struct can_isotp_options)); val = &so->opt; break; case CAN_ISOTP_RECV_FC: len = min_t(int, len, sizeof(struct can_isotp_fc_options)); val = &so->rxfc; break; case CAN_ISOTP_TX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_tx_stmin; break; case CAN_ISOTP_RX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_rx_stmin; break; case CAN_ISOTP_LL_OPTS: len = min_t(int, len, sizeof(struct can_isotp_ll_options)); val = &so->ll; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void isotp_notify(struct isotp_sock *so, unsigned long msg, struct net_device *dev) { struct sock *sk = &so->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (so->ifindex != dev->ifindex) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (isotp_register_rxid(so)) can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(dev_net(dev), dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); } so->ifindex = 0; so->bound = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int isotp_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&isotp_notifier_lock); list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) { spin_unlock(&isotp_notifier_lock); isotp_notify(isotp_busy_notifier, msg, dev); spin_lock(&isotp_notifier_lock); } isotp_busy_notifier = NULL; spin_unlock(&isotp_notifier_lock); return NOTIFY_DONE; } static int isotp_init(struct sock *sk) { struct isotp_sock *so = isotp_sk(sk); so->ifindex = 0; so->bound = 0; so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = so->ll.tx_dl; so->rx.state = ISOTP_IDLE; so->tx.state = ISOTP_IDLE; so->rx.buf = so->rx.sbuf; so->tx.buf = so->tx.sbuf; so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); spin_lock(&isotp_notifier_lock); list_add_tail(&so->notifier, &isotp_notifier_list); spin_unlock(&isotp_notifier_lock); return 0; } static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); poll_wait(file, &so->wait, wait); /* Check for false positives due to TX state */ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE)) mask &= ~(EPOLLOUT | EPOLLWRNORM); return mask; } static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops isotp_ops = { .family = PF_CAN, .release = isotp_release, .bind = isotp_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = isotp_getname, .poll = isotp_poll, .ioctl = isotp_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = isotp_setsockopt, .getsockopt = isotp_getsockopt, .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, }; static struct proto isotp_proto __read_mostly = { .name = "CAN_ISOTP", .owner = THIS_MODULE, .obj_size = sizeof(struct isotp_sock), .init = isotp_init, }; static const struct can_proto isotp_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_ISOTP, .ops = &isotp_ops, .prot = &isotp_proto, }; static struct notifier_block canisotp_notifier = { .notifier_call = isotp_notifier }; static __init int isotp_module_init(void) { int err; max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE); max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE); pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size); err = can_proto_register(&isotp_can_proto); if (err < 0) pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); else register_netdevice_notifier(&canisotp_notifier); return err; } static __exit void isotp_module_exit(void) { can_proto_unregister(&isotp_can_proto); unregister_netdevice_notifier(&canisotp_notifier); } module_init(isotp_module_init); module_exit(isotp_module_exit);
7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <net/ipv6.h> #if IS_ENABLED(CONFIG_IPV6) #if !IS_BUILTIN(CONFIG_IPV6) static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { int ret; ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? 0 : -EINVAL; synchronize_net(); return ret; } EXPORT_SYMBOL(inet6_unregister_icmp_sender); void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmpv6_ndo_send); #endif #endif
4 4 4 9 9 9 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XFRM_HASH_H #define _XFRM_HASH_H #include <linux/xfrm.h> #include <linux/socket.h> #include <linux/jhash.h> static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) { return ntohl(addr->a4); } static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr) { return jhash2((__force u32 *)addr->a6, 4, 0); } static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4; return ntohl((__force __be32)sum); } static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { return __xfrm6_addr_hash(daddr) ^ __xfrm6_addr_hash(saddr); } static inline u32 __bits2mask32(__u8 bits) { u32 mask32 = 0xffffffff; if (bits == 0) mask32 = 0; else if (bits < 32) mask32 <<= (32 - bits); return mask32; } static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits), ntohl(saddr->a4) & __bits2mask32(sbits), 0); } static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, __u8 prefixlen) { unsigned int pdw; unsigned int pbi; u32 initval = 0; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pbi) { __be32 mask; mask = htonl((0xffffffff) << (32 - pbi)); initval = (__force u32)(addr->a6[pdw] & mask); } return jhash2((__force u32 *)addr->a6, pdw, initval); } static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return __xfrm6_pref_hash(daddr, dbits) ^ __xfrm6_pref_hash(saddr, sbits); } static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family, unsigned int hmask) { unsigned int h = family ^ reqid; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask) { unsigned int h = family; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, unsigned int hmask) { unsigned int h = (__force u32)spi ^ proto; switch (family) { case AF_INET: h ^= __xfrm4_addr_hash(daddr); break; case AF_INET6: h ^= __xfrm6_addr_hash(daddr); break; } return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __xfrm_seq_hash(u32 seq, unsigned int hmask) { unsigned int h = seq; return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __idx_hash(u32 index, unsigned int hmask) { return (index ^ (index >> 8)) & hmask; } static inline unsigned int __sel_hash(const struct xfrm_selector *sel, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { const xfrm_address_t *daddr = &sel->daddr; const xfrm_address_t *saddr = &sel->saddr; unsigned int h = 0; switch (family) { case AF_INET: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } static inline unsigned int __addr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { unsigned int h = 0; switch (family) { case AF_INET: h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } struct hlist_head *xfrm_hash_alloc(unsigned int sz); void xfrm_hash_free(struct hlist_head *n, unsigned int sz); #endif /* _XFRM_HASH_H */
3 3 90 85 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a range * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE * flag. * * Node types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0x??1 : Root * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 * 0x010 : 32 bit values, type in 0-2, slot in 3-6 * 0x110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; unsigned char gap; }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. If * stat->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. * @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries, which resolve to a NULL pointer, * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 // SPDX-License-Identifier: GPL-2.0 /* * devtmpfs - kernel-maintained tmpfs-based /dev * * Copyright (C) 2009, Kay Sievers <kay.sievers@vrfy.org> * * During bootup, before any driver core device is registered, * devtmpfs, a tmpfs-based filesystem is created. Every driver-core * device which requests a device node, will add a node in this * filesystem. * By default, all devices are named after the name of the device, * owned by root and have a default mode of 0600. Subsystems can * overwrite the default setting if needed. */ #define pr_fmt(fmt) "devtmpfs: " fmt #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/device.h> #include <linux/blkdev.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/shmem_fs.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/init_syscalls.h> #include <uapi/linux/mount.h> #include "base.h" #ifdef CONFIG_DEVTMPFS_SAFE #define DEVTMPFS_MFLAGS (MS_SILENT | MS_NOEXEC | MS_NOSUID) #else #define DEVTMPFS_MFLAGS (MS_SILENT) #endif static struct task_struct *thread; static int __initdata mount_dev = IS_ENABLED(CONFIG_DEVTMPFS_MOUNT); static DEFINE_SPINLOCK(req_lock); static struct req { struct req *next; struct completion done; int err; const char *name; umode_t mode; /* 0 => delete */ kuid_t uid; kgid_t gid; struct device *dev; } *requests; static int __init mount_param(char *str) { mount_dev = simple_strtoul(str, NULL, 0); return 1; } __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; static struct dentry *public_dev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct super_block *s = mnt->mnt_sb; int err; atomic_inc(&s->s_active); down_write(&s->s_umount); err = reconfigure_single(s, flags, data); if (err < 0) { deactivate_locked_super(s); return ERR_PTR(err); } return dget(s->s_root); } static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS .init_fs_context = shmem_init_fs_context, #else .init_fs_context = ramfs_init_fs_context, #endif .kill_sb = kill_litter_super, }; static struct file_system_type dev_fs_type = { .name = "devtmpfs", .mount = public_dev_mount, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) { init_completion(&req->done); spin_lock(&req_lock); req->next = requests; requests = req; spin_unlock(&req_lock); wake_up_process(thread); wait_for_completion(&req->done); kfree(tmp); return req->err; } int devtmpfs_create_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.mode = 0; req.uid = GLOBAL_ROOT_UID; req.gid = GLOBAL_ROOT_GID; req.name = device_get_devnode(dev, &req.mode, &req.uid, &req.gid, &tmp); if (!req.name) return -ENOMEM; if (req.mode == 0) req.mode = 0600; if (is_blockdev(dev)) req.mode |= S_IFBLK; else req.mode |= S_IFCHR; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } int devtmpfs_delete_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp); if (!req.name) return -ENOMEM; req.mode = 0; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } static int dev_mkdir(const char *name, umode_t mode) { struct dentry *dentry; struct path path; int err; dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); if (!err) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; done_path_create(&path, dentry); return err; } static int create_path(const char *nodepath) { char *path; char *s; int err = 0; /* parent directories do not exist, create them */ path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; s = path; for (;;) { s = strchr(s, '/'); if (!s) break; s[0] = '\0'; err = dev_mkdir(path, 0755); if (err && err != -EEXIST) break; s[0] = '/'; s++; } kfree(path); return err; } static int handle_create(const char *nodename, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { struct dentry *dentry; struct path path; int err; dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt); if (!err) { struct iattr newattrs; newattrs.ia_mode = mode; newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } done_path_create(&path, dentry); return err; } static int dev_rmdir(const char *name) { struct path parent; struct dentry *dentry; int err; dentry = kern_path_locked(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry); else err = -EPERM; } else { err = -ENOENT; } dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); return err; } static int delete_path(const char *nodepath) { char *path; int err = 0; path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; for (;;) { char *base; base = strrchr(path, '/'); if (!base) break; base[0] = '\0'; err = dev_rmdir(path); if (err) break; } kfree(path); return err; } static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat) { /* did we create it */ if (inode->i_private != &thread) return 0; /* does the dev_t match */ if (is_blockdev(dev)) { if (!S_ISBLK(stat->mode)) return 0; } else { if (!S_ISCHR(stat->mode)) return 0; } if (stat->rdev != dev->devt) return 0; /* ours */ return 1; } static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; int deleted = 0; int err; dentry = kern_path_locked(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { struct kstat stat; struct path p = {.mnt = parent.mnt, .dentry = dentry}; err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, AT_STATX_SYNC_AS_STAT); if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { struct iattr newattrs; /* * before unlinking this node, reset permissions * of possible references like hardlinks */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; newattrs.ia_mode = stat.mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } } else { err = -ENOENT; } dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; } /* * If configured, or requested by the commandline, devtmpfs will be * auto-mounted after the kernel mounted the root filesystem. */ int __init devtmpfs_mount(void) { int err; if (!mount_dev) return 0; if (!thread) return 0; err = init_mount("devtmpfs", "dev", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) pr_info("error mounting %d\n", err); else pr_info("mounted\n"); return err; } static __initdata DECLARE_COMPLETION(setup_done); static int handle(const char *name, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { if (mode) return handle_create(name, mode, uid, gid, dev); else return handle_remove(name, dev); } static void __noreturn devtmpfs_work_loop(void) { while (1) { spin_lock(&req_lock); while (requests) { struct req *req = requests; requests = NULL; spin_unlock(&req_lock); while (req) { struct req *next = req->next; req->err = handle(req->name, req->mode, req->uid, req->gid, req->dev); complete(&req->done); req = next; } spin_lock(&req_lock); } __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&req_lock); schedule(); } } static noinline int __init devtmpfs_setup(void *p) { int err; err = ksys_unshare(CLONE_NEWNS); if (err) goto out; err = init_mount("devtmpfs", "/", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) goto out; init_chdir("/.."); /* will traverse into overmounted root */ init_chroot("."); out: *(int *)p = err; return err; } /* * The __ref is because devtmpfs_setup needs to be __init for the routines it * calls. That call is done while devtmpfs_init, which is marked __init, * synchronously waits for it to complete. */ static int __ref devtmpfsd(void *p) { int err = devtmpfs_setup(p); complete(&setup_done); if (err) return err; devtmpfs_work_loop(); return 0; } /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. */ int __init devtmpfs_init(void) { char opts[] = "mode=0755"; int err; mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts); if (IS_ERR(mnt)) { pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); return err; } thread = kthread_run(devtmpfsd, &err, "kdevtmpfs"); if (!IS_ERR(thread)) { wait_for_completion(&setup_done); } else { err = PTR_ERR(thread); thread = NULL; } if (err) { pr_err("unable to create devtmpfs %d\n", err); unregister_filesystem(&dev_fs_type); thread = NULL; return err; } pr_info("initialized\n"); return 0; }
2 2 2 2 2 2 2 2 2 4 2 2 4 4 4 4 456 456 35 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netprio_cgroup.c Priority Control Group * * Authors: Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/atomic.h> #include <linux/sched/task.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/netprio_cgroup.h> #include <linux/fdtable.h> /* * netprio allocates per-net_device priomap array which is indexed by * css->id. Limiting css ID to 16bits doesn't lose anything. */ #define NETPRIO_ID_MAX USHRT_MAX #define PRIOMAP_MIN_SZ 128 /* * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. */ static int extend_netdev_table(struct net_device *dev, u32 target_idx) { struct netprio_map *old, *new; size_t new_sz, new_len; /* is the existing priomap large enough? */ old = rtnl_dereference(dev->priomap); if (old && old->priomap_len > target_idx) return 0; /* * Determine the new size. Let's keep it power-of-two. We start * from PRIOMAP_MIN_SZ and double it until it's large enough to * accommodate @target_idx. */ new_sz = PRIOMAP_MIN_SZ; while (true) { new_len = (new_sz - offsetof(struct netprio_map, priomap)) / sizeof(new->priomap[0]); if (new_len > target_idx) break; new_sz *= 2; /* overflowed? */ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) return -ENOSPC; } /* allocate & copy */ new = kzalloc(new_sz, GFP_KERNEL); if (!new) return -ENOMEM; if (old) memcpy(new->priomap, old->priomap, old->priomap_len * sizeof(old->priomap[0])); new->priomap_len = new_len; /* install the new priomap */ rcu_assign_pointer(dev->priomap, new); if (old) kfree_rcu(old, rcu); return 0; } /** * netprio_prio - return the effective netprio of a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; int id = css->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); if (!prio && (!map || map->priomap_len <= id)) return 0; ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); map->priomap[id] = prio; return 0; } static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; css = kzalloc(sizeof(*css), GFP_KERNEL); if (!css) return ERR_PTR(-ENOMEM); return css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; if (css->id > NETPRIO_ID_MAX) return -ENOSPC; if (!parent_css) return 0; rtnl_lock(); /* * Inherit prios from the parent. As all prios are set during * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { u32 prio = netprio_prio(parent_css, dev); ret = netprio_set_prio(css, dev, prio); if (ret) break; } rtnl_unlock(); return ret; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css); } static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { return css->id; } static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) seq_printf(sf, "%s %u\n", dev->name, netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } static ssize_t write_priomap(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); if (!dev) return -ENODEV; rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); return 0; } static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } } static struct cftype ss_files[] = { { .name = "prioidx", .read_u64 = read_prioidx, }, { .name = "ifpriomap", .seq_show = read_priomap, .write = write_priomap, }, { } /* terminate */ }; struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, .legacy_cftypes = ss_files, }; static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* * Note this is called with rtnl_lock held so we have update side * protection on our rcu assignments */ switch (event) { case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); if (old) kfree_rcu(old, rcu); break; } return NOTIFY_DONE; } static struct notifier_block netprio_device_notifier = { .notifier_call = netprio_device_event }; static int __init init_cgroup_netprio(void) { register_netdevice_notifier(&netprio_device_notifier); return 0; } subsys_initcall(init_cgroup_netprio);
16 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. */ #ifndef IB_ADDR_H #define IB_ADDR_H #include <linux/ethtool.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/socket.h> #include <linux/if_vlan.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/ip.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> #include <net/net_namespace.h> /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. * @dst_dev_addr: Destination MAC address. * @broadcast: Broadcast address of the device. * @dev_type: The interface hardware type of the device. * @bound_dev_if: An optional device interface index. * @transport: The transport type used. * @net: Network namespace containing the bound_dev_if net_dev. * @sgid_attr: GID attribute to use for identified SGID */ struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; struct net *net; const struct ib_gid_attr *sgid_attr; enum rdma_network_type network; int hoplimit; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. * * The dev_addr->net field must be initialized. */ int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr); /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has * been invoked. The net field of the addr struct must be valid. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; } static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) { dev_addr->broadcast[8] = pkey >> 8; dev_addr->broadcast[9] = (unsigned char) pkey; } static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 0xffff; } static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: ipv6_addr_set_v4mapped(((struct sockaddr_in *) addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: *(struct in6_addr *)&gid->raw = ((struct sockaddr_in6 *)addr)->sin6_addr; break; default: return -EINVAL; } return 0; } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) { if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_family = AF_INET; memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); } else { struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); } } /* * rdma_get/set_sgid/dgid() APIs are applicable to IB, and iWarp. * They are not applicable to RoCE. * RoCE GIDs are derived from the IP addresses. */ static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(*gid)); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline enum ib_mtu iboe_get_mtu(int mtu) { /* * Reduce IB headers from effective IBoE MTU. */ mtu = mtu - (IB_GRH_BYTES + IB_UDP_BYTES + IB_BTH_BYTES + IB_EXT_XRC_BYTES + IB_EXT_ATOMICETH_BYTES + IB_ICRC_BYTES); if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) return IB_MTU_4096; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) return IB_MTU_2048; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) return IB_MTU_1024; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) return IB_MTU_512; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) return IB_MTU_256; else return 0; } static inline int rdma_link_local_addr(struct in6_addr *addr) { if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; return 0; } static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) { memcpy(mac, &addr->s6_addr[8], 3); memcpy(mac + 3, &addr->s6_addr[13], 3); mac[0] ^= 2; } static inline int rdma_is_multicast_addr(struct in6_addr *addr) { __be32 ipv4_addr; if (addr->s6_addr[0] == 0xff) return 1; ipv4_addr = addr->s6_addr32[3]; return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; mac[0] = 0x33; mac[1] = 0x33; for (i = 2; i < 6; ++i) mac[i] = addr->s6_addr[i + 10]; } static inline u16 rdma_get_vlan_id(union ib_gid *dgid) { u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; return vid < 0x1000 ? vid : 0xffff; } static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL; } #endif /* IB_ADDR_H */
171 171 61 62 61 62 62 62 61 62 60 62 61 61 2 62 62 62 61 32 31 176 32 32 32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/runtime.c - Helper functions for device runtime PM * * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * Copyright (C) 2010 Alan Stern <stern@rowland.harvard.edu> */ #include <linux/sched/mm.h> #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/rculist.h> #include <trace/events/rpm.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset) { pm_callback_t cb; const struct dev_pm_ops *ops; if (dev->pm_domain) ops = &dev->pm_domain->ops; else if (dev->type && dev->type->pm) ops = dev->type->pm; else if (dev->class && dev->class->pm) ops = dev->class->pm; else if (dev->bus && dev->bus->pm) ops = dev->bus->pm; else ops = NULL; if (ops) cb = *(pm_callback_t *)((void *)ops + cb_offset); else cb = NULL; if (!cb && dev->driver && dev->driver->pm) cb = *(pm_callback_t *)((void *)dev->driver->pm + cb_offset); return cb; } #define RPM_GET_CALLBACK(dev, callback) \ __rpm_get_callback(dev, offsetof(struct dev_pm_ops, callback)) static int rpm_resume(struct device *dev, int rpmflags); static int rpm_suspend(struct device *dev, int rpmflags); /** * update_pm_runtime_accounting - Update the time accounting of power states * @dev: Device to update the accounting for * * In order to be able to have time accounting of the various power states * (as used by programs such as PowerTOP to show the effectiveness of runtime * PM), we need to track the time spent in each state. * update_pm_runtime_accounting must be called each time before the * runtime_status field is updated, to account the time in the old state * correctly. */ static void update_pm_runtime_accounting(struct device *dev) { u64 now, last, delta; if (dev->power.disable_depth > 0) return; last = dev->power.accounting_timestamp; now = ktime_get_mono_fast_ns(); dev->power.accounting_timestamp = now; /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved * timesptamp. */ if (now < last) return; delta = now - last; if (dev->power.runtime_status == RPM_SUSPENDED) dev->power.suspended_time += delta; else dev->power.active_time += delta; } static void __update_runtime_status(struct device *dev, enum rpm_status status) { update_pm_runtime_accounting(dev); dev->power.runtime_status = status; } static u64 rpm_get_accounted_time(struct device *dev, bool suspended) { u64 time; unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); update_pm_runtime_accounting(dev); time = suspended ? dev->power.suspended_time : dev->power.active_time; spin_unlock_irqrestore(&dev->power.lock, flags); return time; } u64 pm_runtime_active_time(struct device *dev) { return rpm_get_accounted_time(dev, false); } u64 pm_runtime_suspended_time(struct device *dev) { return rpm_get_accounted_time(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_suspended_time); /** * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. * @dev: Device to handle. */ static void pm_runtime_deactivate_timer(struct device *dev) { if (dev->power.timer_expires > 0) { hrtimer_try_to_cancel(&dev->power.suspend_timer); dev->power.timer_expires = 0; } } /** * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. * @dev: Device to handle. */ static void pm_runtime_cancel_pending(struct device *dev) { pm_runtime_deactivate_timer(dev); /* * In case there's a request pending, make sure its work function will * return without doing anything. */ dev->power.request = RPM_REQ_NONE; } /* * pm_runtime_autosuspend_expiration - Get a device's autosuspend-delay expiration time. * @dev: Device to handle. * * Compute the autosuspend-delay expiration time based on the device's * power.last_busy time. If the delay has already expired or is disabled * (negative) or the power.use_autosuspend flag isn't set, return 0. * Otherwise return the expiration time in nanoseconds (adjusted to be nonzero). * * This function may be called either with or without dev->power.lock held. * Either way it can be racy, since power.last_busy may be updated at any time. */ u64 pm_runtime_autosuspend_expiration(struct device *dev) { int autosuspend_delay; u64 expires; if (!dev->power.use_autosuspend) return 0; autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) return 0; expires = READ_ONCE(dev->power.last_busy); expires += (u64)autosuspend_delay * NSEC_PER_MSEC; if (expires > ktime_get_mono_fast_ns()) return expires; /* Expires in the future */ return 0; } EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); static int dev_memalloc_noio(struct device *dev, void *data) { return dev->power.memalloc_noio; } /* * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. * @dev: Device to handle. * @enable: True for setting the flag and False for clearing the flag. * * Set the flag for all devices in the path from the device to the * root device in the device tree if @enable is true, otherwise clear * the flag for devices in the path whose siblings don't set the flag. * * The function should only be called by block device, or network * device driver for solving the deadlock problem during runtime * resume/suspend: * * If memory allocation with GFP_KERNEL is called inside runtime * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block * device becomes active and the involed page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. * * The lock of dev_hotplug_mutex is held in the function for handling * hotplug race because pm_runtime_set_memalloc_noio() may be called * in async probe(). * * The function should be called between device_add() and device_del() * on the affected device(block/network device). */ void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) { static DEFINE_MUTEX(dev_hotplug_mutex); mutex_lock(&dev_hotplug_mutex); for (;;) { bool enabled; /* hold power lock since bitfield is not SMP-safe. */ spin_lock_irq(&dev->power.lock); enabled = dev->power.memalloc_noio; dev->power.memalloc_noio = enable; spin_unlock_irq(&dev->power.lock); /* * not need to enable ancestors any more if the device * has been enabled. */ if (enabled && enable) break; dev = dev->parent; /* * clear flag of the parent device only if all the * children don't set the flag because ancestor's * flag was set by any one of the descendants. */ if (!dev || (!enable && device_for_each_child(dev, NULL, dev_memalloc_noio))) break; } mutex_unlock(&dev_hotplug_mutex); } EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); /** * rpm_check_suspend_allowed - Test whether a device may be suspended. * @dev: Device to test. */ static int rpm_check_suspend_allowed(struct device *dev) { int retval = 0; if (dev->power.runtime_error) retval = -EINVAL; else if (dev->power.disable_depth > 0) retval = -EACCES; else if (atomic_read(&dev->power.usage_count)) retval = -EAGAIN; else if (!dev->power.ignore_children && atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. */ else if ((dev->power.deferred_resume && dev->power.runtime_status == RPM_SUSPENDING) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; else if (__dev_pm_qos_resume_latency(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; return retval; } static int rpm_get_suppliers(struct device *dev) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { int retval; if (!(link->flags & DL_FLAG_PM_RUNTIME)) continue; retval = pm_runtime_get_sync(link->supplier); /* Ignore suppliers with disabled runtime PM. */ if (retval < 0 && retval != -EACCES) { pm_runtime_put_noidle(link->supplier); return retval; } refcount_inc(&link->rpm_active); } return 0; } /** * pm_runtime_release_supplier - Drop references to device link's supplier. * @link: Target device link. * * Drop all runtime PM references associated with @link to its supplier device. */ void pm_runtime_release_supplier(struct device_link *link) { struct device *supplier = link->supplier; /* * The additional power.usage_count check is a safety net in case * the rpm_active refcount becomes saturated, in which case * refcount_dec_not_one() would return true forever, but it is not * strictly necessary. */ while (refcount_dec_not_one(&link->rpm_active) && atomic_read(&supplier->power.usage_count) > 0) pm_runtime_put_noidle(supplier); } static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { pm_runtime_release_supplier(link); if (try_to_suspend) pm_request_idle(link->supplier); } } static void rpm_put_suppliers(struct device *dev) { __rpm_put_suppliers(dev, true); } static void rpm_suspend_suppliers(struct device *dev) { struct device_link *link; int idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) pm_request_idle(link->supplier); device_links_read_unlock(idx); } /** * __rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int __rpm_callback(int (*cb)(struct device *), struct device *dev) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int retval = 0, idx; bool use_links = dev->power.links_count > 0; if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); } else { spin_unlock_irq(&dev->power.lock); /* * Resume suppliers if necessary. * * The device's runtime PM status cannot change until this * routine returns, so it is safe to read the status outside of * the lock. */ if (use_links && dev->power.runtime_status == RPM_RESUMING) { idx = device_links_read_lock(); retval = rpm_get_suppliers(dev); if (retval) { rpm_put_suppliers(dev); goto fail; } device_links_read_unlock(idx); } } if (cb) retval = cb(dev); if (dev->power.irq_safe) { spin_lock(&dev->power.lock); } else { /* * If the device is suspending and the callback has returned * success, drop the usage counters of the suppliers that have * been reference counted on its resume. * * Do that if resume fails too. */ if (use_links && ((dev->power.runtime_status == RPM_SUSPENDING && !retval) || (dev->power.runtime_status == RPM_RESUMING && retval))) { idx = device_links_read_lock(); __rpm_put_suppliers(dev, false); fail: device_links_read_unlock(idx); } spin_lock_irq(&dev->power.lock); } return retval; } /** * rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int rpm_callback(int (*cb)(struct device *), struct device *dev) { int retval; if (dev->power.memalloc_noio) { unsigned int noio_flag; /* * Deadlock might be caused if memory allocation with * GFP_KERNEL happens inside runtime_suspend and * runtime_resume callbacks of one block device's * ancestor or the block device itself. Network * device might be thought as part of iSCSI block * device, so network device and its ancestor should * be marked as memalloc_noio too. */ noio_flag = memalloc_noio_save(); retval = __rpm_callback(cb, dev); memalloc_noio_restore(noio_flag); } else { retval = __rpm_callback(cb, dev); } dev->power.runtime_error = retval; return retval != -EACCES ? retval : -EIO; } /** * rpm_idle - Notify device bus type if the device can be suspended. * @dev: Device to notify the bus type about. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. If * another idle notification has been started earlier, return immediately. If * the RPM_ASYNC flag is set then queue an idle-notification request; otherwise * run the ->runtime_idle() callback directly. If the ->runtime_idle callback * doesn't exist or if it returns 0, call rpm_suspend with the RPM_AUTO flag. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_idle(struct device *dev, int rpmflags) { int (*callback)(struct device *); int retval; trace_rpm_idle(dev, rpmflags); retval = rpm_check_suspend_allowed(dev); if (retval < 0) ; /* Conditions are wrong. */ /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; /* * Any pending request other than an idle notification takes * precedence over us, except that the timer may be running. */ else if (dev->power.request_pending && dev->power.request > RPM_REQ_IDLE) retval = -EAGAIN; /* Act as though RPM_NOWAIT is always set. */ else if (dev->power.idle_notification) retval = -EINPROGRESS; if (retval) goto out; /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; callback = RPM_GET_CALLBACK(dev, runtime_idle); /* If no callback assume success. */ if (!callback || dev->power.no_callbacks) goto out; /* Carry out an asynchronous or a synchronous idle notification. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_IDLE; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } trace_rpm_return_int(dev, _THIS_IP_, 0); return 0; } dev->power.idle_notification = true; if (dev->power.irq_safe) spin_unlock(&dev->power.lock); else spin_unlock_irq(&dev->power.lock); retval = callback(dev); if (dev->power.irq_safe) spin_lock(&dev->power.lock); else spin_lock_irq(&dev->power.lock); dev->power.idle_notification = false; wake_up_all(&dev->power.wait_queue); out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval ? retval : rpm_suspend(dev, rpmflags | RPM_AUTO); } /** * rpm_suspend - Carry out runtime suspend of given device. * @dev: Device to suspend. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. * Cancel a pending idle notification, autosuspend or suspend. If * another suspend has been started earlier, either return immediately * or wait for it to finish, depending on the RPM_NOWAIT and RPM_ASYNC * flags. If the RPM_ASYNC flag is set then queue a suspend request; * otherwise run the ->runtime_suspend() callback directly. When * ->runtime_suspend succeeded, if a deferred resume was requested while * the callback was running then carry it out, otherwise send an idle * notification for its parent (if the suspend succeeded and both * ignore_children of parent->power and irq_safe of dev->power are not set). * If ->runtime_suspend failed with -EAGAIN or -EBUSY, and if the RPM_AUTO * flag is set and the next autosuspend-delay expiration time is in the * future, schedule another autosuspend attempt. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_suspend(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval; trace_rpm_suspend(dev, rpmflags); repeat: retval = rpm_check_suspend_allowed(dev); if (retval < 0) goto out; /* Conditions are wrong. */ /* Synchronous suspends are not allowed in the RPM_RESUMING state. */ if (dev->power.runtime_status == RPM_RESUMING && !(rpmflags & RPM_ASYNC)) retval = -EAGAIN; if (retval) goto out; /* If the autosuspend_delay time hasn't expired yet, reschedule. */ if ((rpmflags & RPM_AUTO) && dev->power.runtime_status != RPM_SUSPENDING) { u64 expires = pm_runtime_autosuspend_expiration(dev); if (expires != 0) { /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; /* * Optimization: If the timer is already running and is * set to expire at or before the autosuspend delay, * avoid the overhead of resetting it. Just let it * expire; pm_suspend_timer_fn() will take care of the * rest. */ if (!(dev->power.timer_expires && dev->power.timer_expires <= expires)) { /* * We add a slack of 25% to gather wakeups * without sacrificing the granularity. */ u64 slack = (u64)READ_ONCE(dev->power.autosuspend_delay) * (NSEC_PER_MSEC >> 2); dev->power.timer_expires = expires; hrtimer_start_range_ns(&dev->power.suspend_timer, ns_to_ktime(expires), slack, HRTIMER_MODE_ABS); } dev->power.timer_autosuspends = 1; goto out; } } /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); if (dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { retval = -EINPROGRESS; goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the other suspend running in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ /* Carry out an asynchronous or a synchronous suspend. */ if (rpmflags & RPM_ASYNC) { dev->power.request = (rpmflags & RPM_AUTO) ? RPM_REQ_AUTOSUSPEND : RPM_REQ_SUSPEND; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } goto out; } __update_runtime_status(dev, RPM_SUSPENDING); callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; dev_pm_enable_wake_irq_complete(dev); no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); if (dev->parent) { parent = dev->parent; atomic_add_unless(&parent->power.child_count, -1, 0); } wake_up_all(&dev->power.wait_queue); if (dev->power.deferred_resume) { dev->power.deferred_resume = false; rpm_resume(dev, 0); retval = -EAGAIN; goto out; } if (dev->power.irq_safe) goto out; /* Maybe the parent is now able to suspend. */ if (parent && !parent->power.ignore_children) { spin_unlock(&dev->power.lock); spin_lock(&parent->power.lock); rpm_idle(parent, RPM_ASYNC); spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); } /* Maybe the suppliers are now able to suspend. */ if (dev->power.links_count > 0) { spin_unlock_irq(&dev->power.lock); rpm_suspend_suppliers(dev); spin_lock_irq(&dev->power.lock); } out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; fail: dev_pm_disable_wake_irq_check(dev, true); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); if (retval == -EAGAIN || retval == -EBUSY) { dev->power.runtime_error = 0; /* * If the callback routine failed an autosuspend, and * if the last_busy time has been updated so that there * is a new autosuspend expiration time, automatically * reschedule another autosuspend. */ if ((rpmflags & RPM_AUTO) && pm_runtime_autosuspend_expiration(dev) != 0) goto repeat; } else { pm_runtime_cancel_pending(dev); } goto out; } /** * rpm_resume - Carry out runtime resume of given device. * @dev: Device to resume. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be resumed. Cancel * any scheduled or pending requests. If another resume has been started * earlier, either return immediately or wait for it to finish, depending on the * RPM_NOWAIT and RPM_ASYNC flags. Similarly, if there's a suspend running in * parallel with this function, either tell the other process to resume after * suspending (deferred_resume) or wait for it to finish. If the RPM_ASYNC * flag is set then queue a resume request; otherwise run the * ->runtime_resume() callback directly. Queue an idle notification for the * device if the resume succeeded. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_resume(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval = 0; trace_rpm_resume(dev, rpmflags); repeat: if (dev->power.runtime_error) { retval = -EINVAL; } else if (dev->power.disable_depth > 0) { if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; else retval = -EACCES; } if (retval) goto out; /* * Other scheduled or pending requests need to be canceled. Small * optimization: If an autosuspend timer is running, leave it running * rather than cancelling it now only to restart it again in the near * future. */ dev->power.request = RPM_REQ_NONE; if (!dev->power.timer_autosuspends) pm_runtime_deactivate_timer(dev); if (dev->power.runtime_status == RPM_ACTIVE) { retval = 1; goto out; } if (dev->power.runtime_status == RPM_RESUMING || dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { if (dev->power.runtime_status == RPM_SUSPENDING) { dev->power.deferred_resume = true; if (rpmflags & RPM_NOWAIT) retval = -EINPROGRESS; } else { retval = -EINPROGRESS; } goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the operation carried out in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_RESUMING && dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } /* * See if we can skip waking up the parent. This is safe only if * power.no_callbacks is set, because otherwise we don't know whether * the resume will actually succeed. */ if (dev->power.no_callbacks && !parent && dev->parent) { spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING); if (dev->parent->power.disable_depth > 0 || dev->parent->power.ignore_children || dev->parent->power.runtime_status == RPM_ACTIVE) { atomic_inc(&dev->parent->power.child_count); spin_unlock(&dev->parent->power.lock); retval = 1; goto no_callback; /* Assume success. */ } spin_unlock(&dev->parent->power.lock); } /* Carry out an asynchronous or a synchronous resume. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_RESUME; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } retval = 0; goto out; } if (!parent && dev->parent) { /* * Increment the parent's usage counter and resume it if * necessary. Not needed if dev is irq-safe; then the * parent is permanently resumed. */ parent = dev->parent; if (dev->power.irq_safe) goto skip_parent; spin_unlock(&dev->power.lock); pm_runtime_get_noresume(parent); spin_lock(&parent->power.lock); /* * Resume the parent if it has runtime PM enabled and not been * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { rpm_resume(parent, 0); if (parent->power.runtime_status != RPM_ACTIVE) retval = -EBUSY; } spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); if (retval) goto out; goto repeat; } skip_parent: if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ __update_runtime_status(dev, RPM_RESUMING); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); pm_runtime_mark_last_busy(dev); if (parent) atomic_inc(&parent->power.child_count); } wake_up_all(&dev->power.wait_queue); if (retval >= 0) rpm_idle(dev, RPM_ASYNC); out: if (parent && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_runtime_put(parent); spin_lock_irq(&dev->power.lock); } trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; } /** * pm_runtime_work - Universal runtime PM work function. * @work: Work structure used for scheduling the execution of this function. * * Use @work to get the device object the work is to be done for, determine what * is to be done and execute the appropriate runtime PM function. */ static void pm_runtime_work(struct work_struct *work) { struct device *dev = container_of(work, struct device, power.work); enum rpm_request req; spin_lock_irq(&dev->power.lock); if (!dev->power.request_pending) goto out; req = dev->power.request; dev->power.request = RPM_REQ_NONE; dev->power.request_pending = false; switch (req) { case RPM_REQ_NONE: break; case RPM_REQ_IDLE: rpm_idle(dev, RPM_NOWAIT); break; case RPM_REQ_SUSPEND: rpm_suspend(dev, RPM_NOWAIT); break; case RPM_REQ_AUTOSUSPEND: rpm_suspend(dev, RPM_NOWAIT | RPM_AUTO); break; case RPM_REQ_RESUME: rpm_resume(dev, RPM_NOWAIT); break; } out: spin_unlock_irq(&dev->power.lock); } /** * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). * @timer: hrtimer used by pm_schedule_suspend(). * * Check if the time is right and queue a suspend request. */ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) { struct device *dev = container_of(timer, struct device, power.suspend_timer); unsigned long flags; u64 expires; spin_lock_irqsave(&dev->power.lock, flags); expires = dev->power.timer_expires; /* * If 'expires' is after the current time, we've been called * too early. */ if (expires > 0 && expires < ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); } spin_unlock_irqrestore(&dev->power.lock, flags); return HRTIMER_NORESTART; } /** * pm_schedule_suspend - Set up a timer to submit a suspend request in future. * @dev: Device to suspend. * @delay: Time to wait before submitting a suspend request, in milliseconds. */ int pm_schedule_suspend(struct device *dev, unsigned int delay) { unsigned long flags; u64 expires; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (!delay) { retval = rpm_suspend(dev, RPM_ASYNC); goto out; } retval = rpm_check_suspend_allowed(dev); if (retval) goto out; /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); expires = ktime_get_mono_fast_ns() + (u64)delay * NSEC_PER_MSEC; dev->power.timer_expires = expires; dev->power.timer_autosuspends = 0; hrtimer_start(&dev->power.suspend_timer, expires, HRTIMER_MODE_ABS); out: spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_schedule_suspend); static int rpm_drop_usage_count(struct device *dev) { int ret; ret = atomic_sub_return(1, &dev->power.usage_count); if (ret >= 0) return ret; /* * Because rpm_resume() does not check the usage counter, it will resume * the device even if the usage counter is 0 or negative, so it is * sufficient to increment the usage counter here to reverse the change * made above. */ atomic_inc(&dev->power.usage_count); dev_warn(dev, "Runtime PM usage count underflow!\n"); return -EINVAL; } /** * __pm_runtime_idle - Entry point for runtime idle operations. * @dev: Device to send idle notification for. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out an idle * notification, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_idle(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_idle(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_idle); /** * __pm_runtime_suspend - Entry point for runtime put/suspend operations. * @dev: Device to suspend. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out a suspend, * either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_suspend(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_suspend(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_suspend); /** * __pm_runtime_resume - Entry point for runtime resume operations. * @dev: Device to resume. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, increment the device's usage count. Then * carry out a resume, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_resume(struct device *dev, int rpmflags) { unsigned long flags; int retval; might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe && dev->power.runtime_status != RPM_ACTIVE); if (rpmflags & RPM_GET_PUT) atomic_inc(&dev->power.usage_count); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_resume(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_resume); /** * pm_runtime_get_if_active - Conditionally bump up device usage counter. * @dev: Device to handle. * @ign_usage_count: Whether or not to look at the current usage counter value. * * Return -EINVAL if runtime PM is disabled for @dev. * * Otherwise, if the runtime PM status of @dev is %RPM_ACTIVE and either * @ign_usage_count is %true or the runtime PM usage counter of @dev is not * zero, increment the usage counter of @dev and return 1. Otherwise, return 0 * without changing the usage counter. * * If @ign_usage_count is %true, this function can be used to prevent suspending * the device when its runtime PM status is %RPM_ACTIVE. * * If @ign_usage_count is %false, this function can be used to prevent * suspending the device when both its runtime PM status is %RPM_ACTIVE and its * runtime PM usage counter is not zero. * * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) { unsigned long flags; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.disable_depth > 0) { retval = -EINVAL; } else if (dev->power.runtime_status != RPM_ACTIVE) { retval = 0; } else if (ign_usage_count) { retval = 1; atomic_inc(&dev->power.usage_count); } else { retval = atomic_inc_not_zero(&dev->power.usage_count); } trace_rpm_usage(dev, 0); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); /** * __pm_runtime_set_status - Set runtime PM status of a device. * @dev: Device to handle. * @status: New runtime PM status of the device. * * If runtime PM of the device is disabled or its power.runtime_error field is * different from zero, the status may be changed either to RPM_ACTIVE, or to * RPM_SUSPENDED, as long as that reflects the actual state of the device. * However, if the device has a parent and the parent is not active, and the * parent's power.ignore_children flag is unset, the device's status cannot be * set to RPM_ACTIVE, so -EBUSY is returned in that case. * * If successful, __pm_runtime_set_status() clears the power.runtime_error field * and the device parent's counter of unsuspended children is modified to * reflect the new status. If the new status is RPM_SUSPENDED, an idle * notification request for the parent is submitted. * * If @dev has any suppliers (as reflected by device links to them), and @status * is RPM_ACTIVE, they will be activated upfront and if the activation of one * of them fails, the status of @dev will be changed to RPM_SUSPENDED (instead * of the @status value) and the suppliers will be deacticated on exit. The * error returned by the failing supplier activation will be returned in that * case. */ int __pm_runtime_set_status(struct device *dev, unsigned int status) { struct device *parent = dev->parent; bool notify_parent = false; unsigned long flags; int error = 0; if (status != RPM_ACTIVE && status != RPM_SUSPENDED) return -EINVAL; spin_lock_irqsave(&dev->power.lock, flags); /* * Prevent PM-runtime from being enabled for the device or return an * error if it is enabled already and working. */ if (dev->power.runtime_error || dev->power.disable_depth) dev->power.disable_depth++; else error = -EAGAIN; spin_unlock_irqrestore(&dev->power.lock, flags); if (error) return error; /* * If the new status is RPM_ACTIVE, the suppliers can be activated * upfront regardless of the current status, because next time * rpm_put_suppliers() runs, the rpm_active refcounts of the links * involved will be dropped down to one anyway. */ if (status == RPM_ACTIVE) { int idx = device_links_read_lock(); error = rpm_get_suppliers(dev); if (error) status = RPM_SUSPENDED; device_links_read_unlock(idx); } spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.runtime_status == status || !parent) goto out_set; if (status == RPM_SUSPENDED) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; } else { spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING); /* * It is invalid to put an active child under a parent that is * not active, has runtime PM enabled and the * 'power.ignore_children' flag unset. */ if (!parent->power.disable_depth && !parent->power.ignore_children && parent->power.runtime_status != RPM_ACTIVE) { dev_err(dev, "runtime PM trying to activate child device %s but parent (%s) is not active\n", dev_name(dev), dev_name(parent)); error = -EBUSY; } else if (dev->power.runtime_status == RPM_SUSPENDED) { atomic_inc(&parent->power.child_count); } spin_unlock(&parent->power.lock); if (error) { status = RPM_SUSPENDED; goto out; } } out_set: __update_runtime_status(dev, status); if (!error) dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); if (notify_parent) pm_request_idle(parent); if (status == RPM_SUSPENDED) { int idx = device_links_read_lock(); rpm_put_suppliers(dev); device_links_read_unlock(idx); } pm_runtime_enable(dev); return error; } EXPORT_SYMBOL_GPL(__pm_runtime_set_status); /** * __pm_runtime_barrier - Cancel pending requests and wait for completions. * @dev: Device to handle. * * Flush all pending requests for the device from pm_wq and wait for all * runtime PM operations involving the device in progress to complete. * * Should be called under dev->power.lock with interrupts disabled. */ static void __pm_runtime_barrier(struct device *dev) { pm_runtime_deactivate_timer(dev); if (dev->power.request_pending) { dev->power.request = RPM_REQ_NONE; spin_unlock_irq(&dev->power.lock); cancel_work_sync(&dev->power.work); spin_lock_irq(&dev->power.lock); dev->power.request_pending = false; } if (dev->power.runtime_status == RPM_SUSPENDING || dev->power.runtime_status == RPM_RESUMING || dev->power.idle_notification) { DEFINE_WAIT(wait); /* Suspend, wake-up or idle notification in progress. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING && dev->power.runtime_status != RPM_RESUMING && !dev->power.idle_notification) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); } } /** * pm_runtime_barrier - Flush pending requests and wait for completions. * @dev: Device to handle. * * Prevent the device from being suspended by incrementing its usage counter and * if there's a pending resume request for the device, wake the device up. * Next, make sure that all pending requests for the device have been flushed * from pm_wq and wait for all runtime PM operations involving the device in * progress to complete. * * Return value: * 1, if there was a resume request pending and the device had to be woken up, * 0, otherwise */ int pm_runtime_barrier(struct device *dev) { int retval = 0; pm_runtime_get_noresume(dev); spin_lock_irq(&dev->power.lock); if (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { rpm_resume(dev, 0); retval = 1; } __pm_runtime_barrier(dev); spin_unlock_irq(&dev->power.lock); pm_runtime_put_noidle(dev); return retval; } EXPORT_SYMBOL_GPL(pm_runtime_barrier); /** * __pm_runtime_disable - Disable runtime PM of a device. * @dev: Device to handle. * @check_resume: If set, check if there's a resume request for the device. * * Increment power.disable_depth for the device and if it was zero previously, * cancel all pending runtime PM requests for the device and wait for all * operations in progress to complete. The device can be either active or * suspended after its runtime PM has been disabled. * * If @check_resume is set and there's a resume request pending when * __pm_runtime_disable() is called and power.disable_depth is zero, the * function will wake up the device before disabling its runtime PM. */ void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); if (dev->power.disable_depth > 0) { dev->power.disable_depth++; goto out; } /* * Wake up the device if there's a resume request pending, because that * means there probably is some I/O to process and disabling runtime PM * shouldn't prevent the device from processing the I/O. */ if (check_resume && dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { /* * Prevent suspends and idle notifications from being carried * out after we have woken up the device. */ pm_runtime_get_noresume(dev); rpm_resume(dev, 0); pm_runtime_put_noidle(dev); } /* Update time accounting before disabling PM-runtime. */ update_pm_runtime_accounting(dev); if (!dev->power.disable_depth++) { __pm_runtime_barrier(dev); dev->power.last_status = dev->power.runtime_status; } out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_disable); /** * pm_runtime_enable - Enable runtime PM of a device. * @dev: Device to handle. */ void pm_runtime_enable(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); if (!dev->power.disable_depth) { dev_warn(dev, "Unbalanced %s!\n", __func__); goto out; } if (--dev->power.disable_depth > 0) goto out; dev->power.last_status = RPM_INVALID; dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); if (dev->power.runtime_status == RPM_SUSPENDED && !dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0) dev_warn(dev, "Enabling runtime PM for inactive device with active children\n"); out: spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_runtime_enable); static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } /** * devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable. * * NOTE: this will also handle calling pm_runtime_dont_use_autosuspend() for * you at driver exit time if needed. * * @dev: Device to handle. */ int devm_pm_runtime_enable(struct device *dev) { pm_runtime_enable(dev); return devm_add_action_or_reset(dev, pm_runtime_disable_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_enable); /** * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * * Increase the device's usage count and clear its power.runtime_auto flag, * so that it cannot be suspended at run time until pm_runtime_allow() is called * for it. */ void pm_runtime_forbid(struct device *dev) { spin_lock_irq(&dev->power.lock); if (!dev->power.runtime_auto) goto out; dev->power.runtime_auto = false; atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_forbid); /** * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. * * Decrease the device's usage count and set its power.runtime_auto flag. */ void pm_runtime_allow(struct device *dev) { int ret; spin_lock_irq(&dev->power.lock); if (dev->power.runtime_auto) goto out; dev->power.runtime_auto = true; ret = rpm_drop_usage_count(dev); if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); else if (ret > 0) trace_rpm_usage(dev, RPM_AUTO | RPM_ASYNC); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_allow); /** * pm_runtime_no_callbacks - Ignore runtime PM callbacks for a device. * @dev: Device to handle. * * Set the power.no_callbacks flag, which tells the PM core that this * device is power-managed through its parent and has no runtime PM * callbacks of its own. The runtime sysfs attributes will be removed. */ void pm_runtime_no_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_callbacks = 1; spin_unlock_irq(&dev->power.lock); if (device_is_registered(dev)) rpm_sysfs_remove(dev); } EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks); /** * pm_runtime_irq_safe - Leave interrupts disabled during callbacks. * @dev: Device to handle * * Set the power.irq_safe flag, which tells the PM core that the * ->runtime_suspend() and ->runtime_resume() callbacks for this device should * always be invoked with the spinlock held and interrupts disabled. It also * causes the parent's usage counter to be permanently incremented, preventing * the parent from runtime suspending -- otherwise an irq-safe child might have * to wait for a non-irq-safe parent. */ void pm_runtime_irq_safe(struct device *dev) { if (dev->parent) pm_runtime_get_sync(dev->parent); spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 1; spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_irq_safe); /** * update_autosuspend - Handle a change to a device's autosuspend settings. * @dev: Device to handle. * @old_delay: The former autosuspend_delay value. * @old_use: The former use_autosuspend value. * * Prevent runtime suspend if the new delay is negative and use_autosuspend is * set; otherwise allow it. Send an idle notification if suspends are allowed. * * This function must be called under dev->power.lock with interrupts disabled. */ static void update_autosuspend(struct device *dev, int old_delay, int old_use) { int delay = dev->power.autosuspend_delay; /* Should runtime suspend be prevented now? */ if (dev->power.use_autosuspend && delay < 0) { /* If it used to be allowed then prevent it. */ if (!old_use || old_delay >= 0) { atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); } else { trace_rpm_usage(dev, 0); } } /* Runtime suspend should be allowed now. */ else { /* If it used to be prevented then allow it. */ if (old_use && old_delay < 0) atomic_dec(&dev->power.usage_count); /* Maybe we can autosuspend now. */ rpm_idle(dev, RPM_AUTO); } } /** * pm_runtime_set_autosuspend_delay - Set a device's autosuspend_delay value. * @dev: Device to handle. * @delay: Value of the new delay in milliseconds. * * Set the device's power.autosuspend_delay value. If it changes to negative * and the power.use_autosuspend flag is set, prevent runtime suspends. If it * changes the other way, allow runtime suspends. */ void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.autosuspend_delay = delay; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_set_autosuspend_delay); /** * __pm_runtime_use_autosuspend - Set a device's use_autosuspend flag. * @dev: Device to handle. * @use: New value for use_autosuspend. * * Set the device's power.use_autosuspend flag, and allow or prevent runtime * suspends as needed. */ void __pm_runtime_use_autosuspend(struct device *dev, bool use) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.use_autosuspend = use; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend); /** * pm_runtime_init - Initialize runtime PM fields in given device object. * @dev: Device object to initialize. */ void pm_runtime_init(struct device *dev) { dev->power.runtime_status = RPM_SUSPENDED; dev->power.last_status = RPM_INVALID; dev->power.idle_notification = false; dev->power.disable_depth = 1; atomic_set(&dev->power.usage_count, 0); dev->power.runtime_error = 0; atomic_set(&dev->power.child_count, 0); pm_suspend_ignore_children(dev, false); dev->power.runtime_auto = true; dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; dev->power.needs_force_resume = 0; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; hrtimer_init(&dev->power.suspend_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); dev->power.suspend_timer.function = pm_suspend_timer_fn; init_waitqueue_head(&dev->power.wait_queue); } /** * pm_runtime_reinit - Re-initialize runtime PM fields in given device object. * @dev: Device object to re-initialize. */ void pm_runtime_reinit(struct device *dev) { if (!pm_runtime_enabled(dev)) { if (dev->power.runtime_status == RPM_ACTIVE) pm_runtime_set_suspended(dev); if (dev->power.irq_safe) { spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 0; spin_unlock_irq(&dev->power.lock); if (dev->parent) pm_runtime_put(dev->parent); } } } /** * pm_runtime_remove - Prepare for removing a device from device hierarchy. * @dev: Device object being removed from device hierarchy. */ void pm_runtime_remove(struct device *dev) { __pm_runtime_disable(dev, false); pm_runtime_reinit(dev); } /** * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. */ void pm_runtime_get_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->flags & DL_FLAG_PM_RUNTIME) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); } device_links_read_unlock(idx); } /** * pm_runtime_put_suppliers - Drop references to supplier devices. * @dev: Consumer device. */ void pm_runtime_put_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; pm_runtime_put(link->supplier); } device_links_read_unlock(idx); } void pm_runtime_new_link(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.links_count++; spin_unlock_irq(&dev->power.lock); } static void pm_runtime_drop_link_count(struct device *dev) { spin_lock_irq(&dev->power.lock); WARN_ON(dev->power.links_count == 0); dev->power.links_count--; spin_unlock_irq(&dev->power.lock); } /** * pm_runtime_drop_link - Prepare for device link removal. * @link: Device link going away. * * Drop the link count of the consumer end of @link and decrement the supplier * device's runtime PM usage counter as many times as needed to drop all of the * PM runtime reference to it from the consumer. */ void pm_runtime_drop_link(struct device_link *link) { if (!(link->flags & DL_FLAG_PM_RUNTIME)) return; pm_runtime_drop_link_count(link->consumer); pm_runtime_release_supplier(link); pm_request_idle(link->supplier); } static bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && (atomic_read(&dev->power.child_count) == 0 || dev->power.ignore_children); } /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. * * Disable runtime PM so we safely can check the device's runtime PM status and * if it is active, invoke its ->runtime_suspend callback to suspend it and * change its runtime PM status field to RPM_SUSPENDED. Also, if the device's * usage and children counters don't indicate that the device was in use before * the system-wide transition under way, decrement its parent's children counter * (if there is a parent). Keep runtime PM disabled to preserve the state * unless we encounter errors. * * Typically this function may be invoked from a system suspend callback to make * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. * * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent * state where this function has called the ->runtime_suspend callback but the * PM core marks the driver as runtime active. */ int pm_runtime_force_suspend(struct device *dev) { int (*callback)(struct device *); int ret; pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) return 0; callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); ret = callback ? callback(dev) : 0; if (ret) goto err; dev_pm_enable_wake_irq_complete(dev); /* * If the device can stay in suspend after the system-wide transition * to the working state that will follow, drop the children counter of * its parent, but set its status to RPM_SUSPENDED anyway in case this * function will be called again for it in the meantime. */ if (pm_runtime_need_not_resume(dev)) { pm_runtime_set_suspended(dev); } else { __update_runtime_status(dev, RPM_SUSPENDED); dev->power.needs_force_resume = 1; } return 0; err: dev_pm_disable_wake_irq_check(dev, true); pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); /** * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * Prior invoking this function we expect the user to have brought the device * into low power state by a call to pm_runtime_force_suspend(). Here we reverse * those actions and bring the device into full power, if it is expected to be * used on system resume. In the other case, we defer the resume to be managed * via runtime PM. * * Typically this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { int (*callback)(struct device *); int ret = 0; if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume) goto out; /* * The value of the parent's children counter is correct already, so * just update the status of the device. */ __update_runtime_status(dev, RPM_ACTIVE); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); ret = callback ? callback(dev) : 0; if (ret) { pm_runtime_set_suspended(dev); dev_pm_enable_wake_irq_check(dev, false); goto out; } pm_runtime_mark_last_busy(dev); out: dev->power.needs_force_resume = 0; pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_resume);
2 2 2 71 71 68 23 12 3 9 3 8 19 4 3 1 2 1 10 1005 3 3 23 5 23 23 23 23 24 1 5 21 3 83 82 83 81 1 66 66 66 17 21 4 8 9 11 4 4 4 4 4 4 4 133 8 25 100 60 59 59 59 1 51 8 4 4 2 18 3 1 11 4 2 11 2 11 3 9 6 1 5 1 1 239 429 241 430 52 53 266 11 4 26 222 10 3 4 5 5 4 2 3 4 6 2 4 3 3 3 2 1 2 2 1 187 45 5 5 3 2 4 2 2 2 2 3 1 7 11 4 2 5 2 2 6 1 1 1 7 3 3 2 6 3 2 4 1 1 5 2 1 8 7 2 2 2 2 1 1 17 3 2 2 1 188 202 1309 1156 168 1314 232 1094 1314 86 916 1312 168 1152 1002 1003 1000 1000 1001 999 1002 1003 142 860 1001 40 1001 1041 867 174 1042 1041 1064 1112 1313 1311 1311 1313 1311 57 48 105 107 60 47 2 105 107 2 105 105 1155 128 663 407 409 409 408 409 664 786 14 14 1053 1054 1049 1050 1052 2323 2324 485 485 72 73 72 13 13 1 12 338 2 339 319 320 319 9 9 2 6 1 910 909 911 2 15 7 2 9 4 901 36 26 7 2 5 3 3 12 2 2 26 21 5 9 12 38 93 159 126 7 47 48 135 126 126 61 62 62 135 128 117 118 21 2 2 9 9 9 8 127 126 127 126 125 127 126 98 98 98 97 95 3 46 1 1 1 9 3 6 142 143 142 116 31 142 66 66 66 59 12 66 2461 2466 2450 2459 2399 108 2461 290 293 3 291 284 4 291 1 409 407 1 911 11 9 2 88 45 59 118 81 70 1313 296 1019 1155 166 1310 291 1018 4565 4521 60 4562 4540 113 3701 940 4557 4553 45 48 2 4 4 3 2 2 3 6 3 62 7 3 2 2 1 3 27 16 11 934 64 11 53 64 64 34 35 9 6 1 2 3 2 9 3 3 5 18 21 53 35 10 8 23 11 26 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> #include <linux/mroute.h> #include <linux/mroute6.h> #include <linux/icmpv6.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> #include <net/sock_reuseport.h> #include <net/bpf_sk_storage.h> #include <trace/events/sock.h> #include <net/tcp.h> #include <net/busy_poll.h> #include <net/phonet/phonet.h> #include <linux/ethtool.h> #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); static void sock_def_write_space_wfree(struct sock *sk); static void sock_def_write_space(struct sock *sk); /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns. */ bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap) { return file_ns_capable(sk->sk_socket->file, user_ns, cap) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces. */ bool sk_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, &init_user_ns, cap); } EXPORT_SYMBOL(sk_capable); /** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ bool sk_net_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ #define _sock_locks(x) \ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ x "27" , x "28" , x "AF_CAN" , \ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { _sock_locks("clock-") }; static const char *const af_family_kern_key_strings[AF_MAX+1] = { _sock_locks("k-sk_lock-") }; static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { _sock_locks("k-slock-") }; static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { _sock_locks("elock-") }; /* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; static struct lock_class_key af_rlock_keys[AF_MAX]; static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE; int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on * * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. * It's the responsibility of the admin to adjust min_free_kbytes * to meet the requirements */ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again. */ sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); void sk_error_report(struct sock *sk) { sk->sk_error_report(sk); switch (sk->sk_family) { case AF_INET: fallthrough; case AF_INET6: trace_inet_sk_error_report(sk); break; default: break; } } EXPORT_SYMBOL(sk_error_report); int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; tv.tv_usec = 0; } else { tv.tv_sec = timeo / HZ; tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); } if (old_timeval) { struct __kernel_old_timeval old_tv; old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; return sizeof(old_tv); } *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } EXPORT_SYMBOL(sock_get_timeout); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval) { if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) return -EINVAL; if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv->tv_sec = tv32.tv_sec; tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; if (optlen < sizeof(old_tv)) return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv->tv_sec = old_tv.tv_sec; tv->tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(*tv)) return -EINVAL; if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } return 0; } EXPORT_SYMBOL(sock_copy_user_timeval); static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool old_timeval) { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); long val; if (err) return err; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", __func__, current->comm, task_pid_nr(current)); } return 0; } val = MAX_SCHEDULE_TIMEOUT; if ((tv.tv_sec || tv.tv_usec) && (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); WRITE_ONCE(*timeo_p, val); return 0; } static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { case AF_UNSPEC: case AF_UNIX: return false; default: return true; } } static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; if (sock_needs_netstamp(sk) && !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(__sock_queue_rcv_skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { enum skb_drop_reason drop_reason; int err; err = sk_filter(sk, skb); if (err) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto out; } err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; break; case -ENOBUFS: drop_reason = SKB_DROP_REASON_PROTO_MEM; break; default: drop_reason = SKB_NOT_DROPPED_YET; break; } out: if (reason) *reason = drop_reason; return err; } EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; } bh_unlock_sock(sk); out: if (refcounted) sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); goto out; } EXPORT_SYMBOL(__sk_receive_skb); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(__sk_dst_check); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(sk_dst_check); static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); /* Sorry... */ ret = -EPERM; if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; if (ifindex < 0) goto out; /* Paired with all READ_ONCE() done locklessly. */ WRITE_ONCE(sk->sk_bound_dev_if, ifindex); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); ret = 0; out: #endif return ret; } int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; if (lock_sk) lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); if (lock_sk) release_sock(sk); return ret; } EXPORT_SYMBOL(sock_bindtoindex); static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, devname); if (dev) index = dev->ifindex; rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; } sockopt_lock_sock(sk); ret = sock_bindtoindex_locked(sk, index); sockopt_release_sock(sk); out: #endif return ret; } static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; if (bound_dev_if == 0) { len = 0; goto zero; } ret = -EINVAL; if (len < IFNAMSIZ) goto out; ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; out: #endif return ret; } bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; /* IPV6_ADDRFORM can change sk->sk_family under us. */ switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); void sock_set_reuseaddr(struct sock *sk) { lock_sock(sk); sk->sk_reuse = SK_CAN_REUSE; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseaddr); void sock_set_reuseport(struct sock *sk) { lock_sock(sk); sk->sk_reuseport = true; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { WRITE_ONCE(sk->sk_priority, priority); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } void sock_enable_timestamps(struct sock *sk) { lock_sock(sk); __sock_set_timestamps(sk, true, false, true); release_sock(sk); } EXPORT_SYMBOL(sock_enable_timestamps); void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { case SO_TIMESTAMP_OLD: __sock_set_timestamps(sk, valbool, false, false); break; case SO_TIMESTAMP_NEW: __sock_set_timestamps(sk, valbool, true, false); break; case SO_TIMESTAMPNS_OLD: __sock_set_timestamps(sk, valbool, false, true); break; case SO_TIMESTAMPNS_NEW: __sock_set_timestamps(sk, valbool, true, true); break; } } static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { struct net *net = sock_net(sk); struct net_device *dev = NULL; bool match = false; int *vclock_index; int i, num; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); if (!dev) { pr_err("%s: sock not bind to device\n", __func__); return -EOPNOTSUPP; } num = ethtool_get_phc_vclocks(dev, &vclock_index); dev_put(dev); for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; break; } } if (num > 0) kfree(vclock_index); if (!match) return -EINVAL; WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping) { int val = timestamping.flags; int ret; if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP && !(val & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP) atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); else atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } } if (val & SOF_TIMESTAMPING_OPT_STATS && !(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL; if (val & SOF_TIMESTAMPING_BIND_PHC) { ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); if (ret) return ret; } WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); return 0; } void sock_set_keepalive(struct sock *sk) { lock_sock(sk); if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, true); sock_valbool_flag(sk, SOCK_KEEPOPEN, true); release_sock(sk); } EXPORT_SYMBOL(sock_set_keepalive); static void __sock_set_rcvbuf(struct sock *sk, int val) { /* Ensure val * 2 fits into an int, to prevent max_t() from treating it * as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* We double it on the way in to account for "struct sk_buff" etc. * overhead. Applications assume that the SO_RCVBUF setting they make * will allow that much actual data to be received on that socket. * * Applications are unaware that "struct sk_buff" and other overheads * allocate from the receive buffer during socket buffer allocation. * * And after considering the possible alternatives, returning the value * we actually used in getsockopt is the most desirable behavior. */ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); } void sock_set_rcvbuf(struct sock *sk, int val) { lock_sock(sk); __sock_set_rcvbuf(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } void sock_set_mark(struct sock *sk, u32 val) { lock_sock(sk); __sock_set_mark(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } static int sock_reserve_memory(struct sock *sk, int bytes) { long allocated; bool charged; int pages; if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) return 0; pages = sk_mem_pages(bytes); /* pre-charge to memcg */ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error. */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from * a bpf prog. bpf has ensured the sk lock has been * acquired before calling setsockopt(). */ if (has_current_bpf_ctx()) return; lock_sock(sk); } EXPORT_SYMBOL(sockopt_lock_sock); void sockopt_release_sock(struct sock *sk) { if (has_current_bpf_ctx()) return; release_sock(sk); } EXPORT_SYMBOL(sockopt_release_sock); bool sockopt_ns_capable(struct user_namespace *ns, int cap) { return has_current_bpf_ctx() || ns_capable(ns, cap); } EXPORT_SYMBOL(sockopt_ns_capable); bool sockopt_capable(int cap) { return has_current_bpf_ctx() || capable(cap); } EXPORT_SYMBOL(sockopt_capable); /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */ if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; /* handle options which do not require locking the socket. */ switch (optname) { case SO_PRIORITY: if ((val >= 0 && val <= 6) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { sock_set_priority(sk, val); return 0; } return -EPERM; case SO_PASSSEC: assign_bit(SOCK_PASSSEC, &sock->flags, valbool); return 0; case SO_PASSCRED: assign_bit(SOCK_PASSCRED, &sock->flags, valbool); return 0; case SO_PASSPIDFD: assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: return -ENOPROTOOPT; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: if (val < 0) return -EINVAL; WRITE_ONCE(sk->sk_ll_usec, val); return 0; case SO_PREFER_BUSY_POLL: if (valbool && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); return 0; case SO_BUSY_POLL_BUDGET: if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; if (val < 0 || val > U16_MAX) return -EINVAL; WRITE_ONCE(sk->sk_busy_poll_budget, val); return 0; #endif case SO_MAX_PACING_RATE: { unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; unsigned long pacing_rate; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && copy_from_sockptr(&ulval, optval, sizeof(ulval))) { return -EFAULT; } if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); /* Pairs with READ_ONCE() from sk_getsockopt() */ WRITE_ONCE(sk->sk_max_pacing_rate, ulval); pacing_rate = READ_ONCE(sk->sk_pacing_rate); if (ulval < pacing_rate) WRITE_ONCE(sk->sk_pacing_rate, ulval); return 0; } case SO_TXREHASH: if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() * and sk_getsockopt(). */ WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; case SO_PEEK_OFF: { int (*set_peek_off)(struct sock *sk, int val); set_peek_off = READ_ONCE(sock->ops)->set_peek_off; if (set_peek_off) ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; return ret; } } sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ if (val < 0) val = 0; goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ __sock_set_rcvbuf(sk, max(val, 0)); break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; case SO_OOBINLINE: sock_valbool_flag(sk, SOCK_URGINLINE, valbool); break; case SO_NO_CHECK: sk->sk_no_check_tx = valbool; break; case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ break; } if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); } else { unsigned long t_sec = ling.l_linger; if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: sock_set_timestamp(sk, optname, valbool); break; case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(&timestamping, optval, sizeof(timestamping))) { ret = -EFAULT; break; } } else { memset(&timestamping, 0, sizeof(timestamping)); timestamping.flags = val; } ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: { int (*set_rcvlowat)(struct sock *sk, int val) = NULL; if (val < 0) val = INT_MAX; if (sock) set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; if (set_rcvlowat) ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } break; case SO_DETACH_REUSEPORT_BPF: ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) ret = -EPERM; else sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; case SO_RCVMARK: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; case SO_WIFI_STATUS: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; case SO_SELECT_ERR_QUEUE: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: if (val == 1) dst_negative_advice(sk); break; case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; break; } /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); sk->sk_txtime_report_errors = !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: ret = sock_bindtoindex_locked(sk, val); break; case SO_BUF_LOCK: if (val & ~SOCK_BUF_LOCK_MASK) { ret = -EINVAL; break; } sk->sk_userlocks = val | (sk->sk_userlocks & ~SOCK_BUF_LOCK_MASK); break; case SO_RESERVE_MEM: { int delta; if (val < 0) { ret = -EINVAL; break; } delta = val - sk->sk_reserved_mem; if (delta < 0) sock_release_reserved_memory(sk, -delta); else ret = sock_reserve_memory(sk, delta); break; } default: ret = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return ret; } int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { return sk_setsockopt(sock->sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) { const struct cred *cred; spin_lock(&sk->sk_peer_lock); cred = get_cred(sk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); return cred; } static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; if (cred) { struct user_namespace *current_ns = current_user_ns(); ucred->uid = from_kuid_munged(current_ns, cred->euid); ucred->gid = from_kgid_munged(current_ns, cred->egid); } } static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) { gid_t gid = from_kgid_munged(user_ns, src->gid[i]); if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; } return 0; } int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; union { int val; u64 val64; unsigned long ulval; struct linger ling; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; struct so_timestamping timestamping; } v; int lv = sizeof(int); int len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; memset(&v, 0, sizeof(v)); switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; case SO_DONTROUTE: v.val = sock_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: v.val = sock_flag(sk, SOCK_BROADCAST); break; case SO_SNDBUF: v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: v.val = sk->sk_reuse; break; case SO_REUSEPORT: v.val = sk->sk_reuseport; break; case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; case SO_TYPE: v.val = sk->sk_type; break; case SO_PROTOCOL: v.val = sk->sk_protocol; break; case SO_DOMAIN: v.val = sk->sk_family; break; case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; case SO_OOBINLINE: v.val = sock_flag(sk, SOCK_URGINLINE); break; case SO_NO_CHECK: v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMP_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPNS_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: lv = sizeof(v.timestamping); /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only * returning the flags when they were set through the same option. * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. */ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); } break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: v.val = 1; break; case SO_PASSCRED: v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PASSPIDFD: v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); break; case SO_PEERCRED: { struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } case SO_PEERPIDFD: { struct pid *peer_pid; struct file *pidfd_file = NULL; int pidfd; if (len > sizeof(pidfd)) len = sizeof(pidfd); spin_lock(&sk->sk_peer_lock); peer_pid = get_pid(sk->sk_peer_pid); spin_unlock(&sk->sk_peer_lock); if (!peer_pid) return -ENODATA; pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { put_unused_fd(pidfd); fput(pidfd_file); return -EFAULT; } fd_install(pidfd, pidfd_file); return 0; } case SO_PEERGROUPS: { const struct cred *cred; int ret, n; cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA; n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; goto lenout; } case SO_PEERNAME: { struct sockaddr_storage address; lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... -DaveM */ case SO_ACCEPTCONN: v.val = sk->sk_state == TCP_LISTEN; break; case SO_PASSSEC: v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; case SO_PEERSEC: return security_socket_getpeersec_stream(sock, optval, optlen, len); case SO_MARK: v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: v.val = sock_flag(sk, SOCK_RCVMARK); break; case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; case SO_WIFI_STATUS: v.val = sock_flag(sk, SOCK_WIFI_STATUS); break; case SO_PEEK_OFF: if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; case SO_BINDTODEVICE: return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: len = sk_get_filter(sk, optval, len); if (len < 0) return len; goto lenout; case SO_LOCK_FILTER: v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; case SO_BPF_EXTENSIONS: v.val = bpf_tell_extensions(); break; case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); break; #endif case SO_MAX_PACING_RATE: /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ v.val = min_t(unsigned long, ~0U, READ_ONCE(sk->sk_max_pacing_rate)); } break; case SO_INCOMING_CPU: v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: { u32 meminfo[SK_MEMINFO_VARS]; sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; } #ifdef CONFIG_NET_RX_BUSY_POLL case SO_INCOMING_NAPI_ID: v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ if (v.val < MIN_NAPI_ID) v.val = 0; break; #endif case SO_COOKIE: lv = sizeof(u64); if (len < lv) return -EINVAL; v.val64 = sock_gen_cookie(sk); break; case SO_ZEROCOPY: v.val = sock_flag(sk, SOCK_ZEROCOPY); break; case SO_TXTIME: lv = sizeof(v.txtime); v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; v.txtime.flags |= sk->sk_txtime_report_errors ? SOF_TXTIME_REPORT_ERRORS : 0; break; case SO_BINDTOIFINDEX: v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: lv = sizeof(u64); if (len != lv) return -EINVAL; v.val64 = sock_net(sk)->net_cookie; break; case SO_BUF_LOCK: v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break; case SO_RESERVE_MEM: v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). */ return -ENOPROTOOPT; } if (len > lv) len = lv; if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } /* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.) */ static inline void sock_lock_init(struct sock *sk) { if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, af_family_kern_slock_key_strings[sk->sk_family], af_family_kern_slock_keys + sk->sk_family, af_family_kern_key_strings[sk->sk_family], af_family_kern_keys + sk->sk_family); else sock_lock_init_class_and_name( sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], af_family_keys + sk->sk_family); } /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarly, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif /* If we move sk_tx_queue_mapping out of the private section, * we must check if sk_tx_queue_clear() is called after * sock_copy() in sk_clone_lock(). */ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < offsetof(struct sock, sk_dontcopy_begin) || offsetof(struct sock, sk_tx_queue_mapping) >= offsetof(struct sock, sk_dontcopy_end)); memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end), /* alloc is larger than struct, see sk_prot_alloc() */); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; struct kmem_cache *slab; slab = prot->slab; if (slab != NULL) { sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; } return sk; out_free_sec: security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); return NULL; } static void sk_prot_free(struct proto *prot, struct sock *sk) { struct kmem_cache *slab; struct module *owner; owner = prot->owner; slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); module_put(owner); } /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); /* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */ static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); #ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk); #endif if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; } /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); else __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); sk_prot_free(sk->sk_prot_creator, sk); } void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); } static void __sk_free(struct sock *sk) { if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); } void sk_free(struct sock *sk) { /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); static void sk_init_common(struct sock *sk) { skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_receive_queue.lock, af_rlock_keys + sk->sk_family, af_family_rlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_write_queue.lock, af_wlock_keys + sk->sk_family, af_family_wlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; bool is_charged = true; struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (!newsk) goto out; sock_copy(newsk, sk); newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); } else { /* Kernel sockets are not elevating the struct net refcount. * Instead, use a tracker to more easily detect if a layer * is not properly dismantling its kernel sockets at netns * destroy time. */ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ refcount_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); RCU_INIT_POINTER(newsk->sk_filter, filter); rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter. */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); if (bpf_sk_storage_clone(sk, newsk)) { sk_free_unlock_clone(newsk); newsk = NULL; goto out; } /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. */ if (sk_user_data_is_nocopy(newsk)) newsk->sk_user_data = NULL; newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); /* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); out: return newsk; } EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_free_unlock_clone(struct sock *sk) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ sk->sk_destruct = NULL; bh_unlock_sock(sk); sk_free(sk); } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { bool is_ipv6 = false; u32 max_size; #if IS_ENABLED(CONFIG_IPV6) is_ipv6 = (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : READ_ONCE(dst->dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ /* * Write buffer destructor automatically called from kfree_skb. */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; bool free; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); sock_def_write_space_wfree(sk); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); return; } /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } /* * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); /* This variant of sock_wfree() is used by TCP, * since it sets SOCK_USE_WRITE_QUEUE. */ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) { skb->destructor = sock_edemux; sock_hold(sk); return; } #endif skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() wont free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { #ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ if (skb->decrypted) return false; #endif return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { if (skb_is_tcp_pure_ack(skb)) return; if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) return; skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); /* * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; atomic_sub(len, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); /* * Buffer destructor for skbs that are not used directly in read or write * path, e.g. for error handler skbs. Automatically called from kfree_skb. */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); /* Buffer destructor for prefetch/receive path where reference count may * not be held, e.g. for listen sockets. */ #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (!sk_is_refcounted(sk)) return; if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { inet_reqsk(sk)->rsk_listener = NULL; reqsk_free(inet_reqsk(sk)); return; } sock_gen_put(sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; read_unlock(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(__sock_i_ino); unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; local_bh_disable(); ino = __sock_i_ino(sk); local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. */ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; } } return NULL; } EXPORT_SYMBOL(sock_wmalloc); static void sock_ofree(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_omem_alloc); } struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { struct sk_buff *skb; /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); if (!skb) return NULL; atomic_add(skb->truesize, &sk->sk_omem_alloc); skb->sk = sk; skb->destructor = sock_ofree; return skb; } /* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); if (mem) return mem; atomic_sub(size, &sk->sk_omem_alloc); } return NULL; } EXPORT_SYMBOL(sock_kmalloc); /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. */ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify) { if (WARN_ON_ONCE(!mem)) return; if (nullify) kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } void sock_kfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, false); } EXPORT_SYMBOL(sock_kfree_s); void sock_kzfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, true); } EXPORT_SYMBOL(sock_kzfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Generic send/receive buffer handlers */ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { struct sk_buff *skb; long timeo; int err; timeo = sock_sndtimeo(sk, noblock); for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) goto failure; if (signal_pending(current)) goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation); if (skb) skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; } EXPORT_SYMBOL(sock_alloc_send_pskb); int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; tsflags = *(u32 *)CMSG_DATA(cmsg); if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) return -EINVAL; sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(__sock_cmsg_send); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } return 0; } EXPORT_SYMBOL(sock_cmsg_send); static void sk_enter_memory_pressure(struct sock *sk) { if (!sk->sk_prot->enter_memory_pressure) return; sk->sk_prot->enter_memory_pressure(sk); } static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; if (memory_pressure && READ_ONCE(*memory_pressure)) WRITE_ONCE(*memory_pressure, 0); } } DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER && !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; return true; } return false; } EXPORT_SYMBOL(skb_page_frag_refill); bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; } EXPORT_SYMBOL(sk_page_frag_refill); void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); do { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); skb = next; } while (skb != NULL); spin_lock_bh(&sk->sk_lock.slock); } /* * Doing the zeroing here guarantee we can not loop forever * while a wild producer attempts to flood us. */ sk->sk_backlog.len = 0; } void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate * @amt: pages to allocate * @kind: allocation type * * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. * * Unlike the globally shared limits among the sockets under same protocol, * consuming the budget of a memcg won't have direct effect on other ones. * So be optimistic about memcg's tolerance, and leave the callers to decide * whether or not to raise allocated through sk_under_memory_pressure() or * its variants. */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) goto suppress_allocation; charged = true; } /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); /* Over hard limit. */ if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* Guarantee minimum buffer size under pressure (either global * or memcg) to make sure features described in RFC 7323 (TCP * Extensions for High Performance) work properly. * * This rule does NOT stand when exceeds global or memcg's hard * limit, or else a DoS attack can be taken place by spawning * lots of sockets whose usage are under minimum buffer size. */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ int wmem0 = sk_get_wmem0(sk, prot); if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < wmem0) return 1; } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; } } if (sk_has_memory_pressure(sk)) { u64 alloc; /* The following 'average' heuristic is within the * scope of global accounting, so it only makes * sense for global memory pressure. */ if (!sk_under_global_memory_pressure(sk)) return 1; /* Try to be fair among all the sockets under global * pressure by allowing the ones that below average * usage to raise. */ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) return 1; } suppress_allocation: if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { sk_stream_moderate_sndbuf(sk); /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ if (memcg && !charged) { mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } return 1; } } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; } /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated * @sk: socket * @size: memory size to allocate * @kind: allocation type * * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means * rmem allocation. This function assumes that protocols which have * memory_pressure use sk_wmem_queued as write buffer accounting. */ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket * @amount: number of quanta * * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain * cases where it makes no sense for a protocol to have a "do nothing" * function, some default processing is provided. */ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_shutdown); int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg_locked); int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } EXPORT_SYMBOL(sock_no_mmap); /* * When a file is received (via SCM_RIGHTS, etc), we must bump the * various sock-based usage counts. */ void __receive_sock(struct file *file) { struct socket *sock; sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } } /* * Default Socket Callbacks */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } void sock_def_readable(struct sock *sk) { struct socket_wq *wq; trace_sk_data_ready(sk); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* An optimised version of sock_def_write_space(), should only be called * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ static void sock_def_write_space_wfree(struct sock *sk) { /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ smp_mb__after_atomic(); if (wq && waitqueue_active(&wq->wait)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } } static void sock_def_destruct(struct sock *sk) { } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(&sk->sk_socket->file->f_owner)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) { if (!mod_timer(timer, expires)) sock_hold(sk); } EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { if (del_timer_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); } sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) lockdep_set_class_and_name( &sk->sk_callback_lock, af_kern_callback_keys + sk->sk_family, af_family_kern_clock_key_strings[sk->sk_family]); else lockdep_set_class_and_name( &sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; spin_lock_init(&sk->sk_peer_lock); sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; #if BITS_PER_LONG==32 seqlock_init(&sk->sk_stamp_seq); #endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data_uid); void sock_init_data(struct socket *sock, struct sock *sk) { kuid_t uid = sock ? SOCK_INODE(sock)->i_uid : make_kuid(sock_net(sk)->user_ns, 0); sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user_nocheck(sk)) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. * * The 'mutex' is not contended and holding * sock::sk_lock.slock prevents all other lockers to * proceed so the corresponding unlock_sock_fast() can * avoid the slow path of release_sock() completely and * just release slock. * * From a semantical POV this is equivalent to 'acquiring' * the 'mutex', hence the corresponding lockdep * mutex_release() has to happen in the fast path of * unlock_sock_fast(). */ return false; } __lock_sock(sk); sk->sk_lock.owned = 1; __acquire(&sk->sk_lock.slock); spin_unlock_bh(&sk->sk_lock.slock); return true; } EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) { struct sock *sk = sock->sk; struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } if (timeval) ts.tv_nsec /= 1000; #ifdef CONFIG_COMPAT_32BIT_TIME if (time32) return put_old_timespec32(&ts, userstamp); #endif #ifdef CONFIG_SPARC64 /* beware of padding in sparc64 timeval */ if (timeval && !in_compat_syscall()) { struct __kernel_old_timeval __user tv = { .tv_sec = ts.tv_sec, .tv_usec = ts.tv_nsec, }; if (copy_to_user(userstamp, &tv, sizeof(tv))) return -EFAULT; return 0; } #endif return put_timespec64(&ts, userstamp); } EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ if (sock_needs_netstamp(sk) && !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; struct sk_buff *skb; int copied, err; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; out_free_skb: kfree_skb(skb); out: return err; } EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume * this means if you specify SO_ERROR (otherwise whats the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * * A. Remove from hash tables. */ sk->sk_prot->unhash(sk); /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and * did hash table lookup before we unhashed socket. They will achieve * receive queue and will be purged by socket destructor. * * Also we still have packets pending on receive queue and probably, * our own packets waiting in device queues. sock_destroy will drain * receive queue, but transmitted packets will delay socket destruction * until the last reference will be released. */ sock_orphan(sk); xfrm_sk_free_policy(sk); sock_put(sk); } EXPORT_SYMBOL(sk_common_release); void sk_get_meminfo(const struct sock *sk, u32 *mem) { memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } #ifdef CONFIG_PROC_FS static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; int res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; return res; } EXPORT_SYMBOL_GPL(sock_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; return 0; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, }; static __init int net_inuse_init(void) { if (register_pernet_subsys(&net_inuse_ops)) panic("Cannot initialize net inuse counters"); return 0; } core_initcall(net_inuse_init); static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); return 0; } static void release_proto_idx(struct proto *prot) { if (prot->inuse_idx != PROTO_INUSE_NR - 1) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else static inline int assign_proto_idx(struct proto *prot) { return 0; } static inline void release_proto_idx(struct proto *prot) { } #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) { if (!twsk_prot) return; kfree(twsk_prot->twsk_slab_name); twsk_prot->twsk_slab_name = NULL; kmem_cache_destroy(twsk_prot->twsk_slab); twsk_prot->twsk_slab = NULL; } static int tw_prot_init(const struct proto *prot) { struct timewait_sock_ops *twsk_prot = prot->twsk_prot; if (!twsk_prot) return 0; twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (!twsk_prot->twsk_slab_name) return -ENOMEM; twsk_prot->twsk_slab = kmem_cache_create(twsk_prot->twsk_slab_name, twsk_prot->twsk_obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!twsk_prot->twsk_slab) { pr_crit("%s: Can't create timewait sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; kmem_cache_destroy(rsk_prot->slab); rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) { struct request_sock_ops *rsk_prot = prot->rsk_prot; if (!rsk_prot) return 0; rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (!rsk_prot->slab_name) return -ENOMEM; rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; if (prot->memory_allocated && !prot->sysctl_mem) { pr_err("%s: missing sysctl_mem\n", prot->name); return -EINVAL; } if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); return -EINVAL; } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags, prot->useroffset, prot->usersize, NULL); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } if (req_prot_init(prot)) goto out_free_request_sock_slab; if (tw_prot_init(prot)) goto out_free_timewait_sock_slab; } mutex_lock(&proto_list_mutex); ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; out_free_timewait_sock_slab: if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); kmem_cache_destroy(prot->slab); prot->slab = NULL; } out: return ret; } EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); mutex_unlock(&proto_list_mutex); kmem_cache_destroy(prot->slab); prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); int sock_load_diag_module(int family, int protocol) { if (!protocol) { if (!sock_is_registered(family)) return -ENOENT; return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family); } #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family, protocol); } EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) { mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) __releases(proto_list_mutex) { mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } static long sock_prot_memory_allocated(struct proto *proto) { return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; } static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_memory_allocated(proto), sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), proto_method_implemented(proto->close), proto_method_implemented(proto->connect), proto_method_implemented(proto->disconnect), proto_method_implemented(proto->accept), proto_method_implemented(proto->ioctl), proto_method_implemented(proto->init), proto_method_implemented(proto->destroy), proto_method_implemented(proto->shutdown), proto_method_implemented(proto->setsockopt), proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), proto_method_implemented(proto->unhash), proto_method_implemented(proto->get_port), proto_method_implemented(proto->enter_memory_pressure)); } static int proto_seq_show(struct seq_file *seq, void *v) { if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", "sockets", "memory", "press", "maxhdr", "slab", "module", "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, .show = proto_seq_show, }; static __net_init int proto_init_net(struct net *net) { if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static __net_exit void proto_exit_net(struct net *net) { remove_proc_entry("protocols", net->proc_net); } static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, }; static int __init proto_init(void) { return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ #ifdef CONFIG_NET_RX_BUSY_POLL bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) return true; if (sk_is_udp(sk) && !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) return true; return sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); /* Copy 'size' bytes from userspace and return `size` back to userspace */ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size) { int ret; if (copy_from_user(karg, arg, size)) return -EFAULT; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); if (ret) return ret; if (copy_to_user(arg, karg, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(sock_ioctl_inout); /* This is the most common ioctl prep function, where the result (4 bytes) is * copied back to userspace if the ioctl() returns successfully. No input is * copied from userspace as input argument. */ static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) { int ret, karg = 0; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); if (ret) return ret; return put_user(karg, (int __user *)arg); } /* A wrapper around sock ioctls, which copies the data from userspace * (depending on the protocol/ioctl), and copies back the result to userspace. * The main motivation for this function is to pass kernel memory to the * protocol ioctl callbacks, instead of userspace memory. */ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int rc = 1; if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) rc = ipmr_sk_ioctl(sk, cmd, arg); else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) rc = ip6mr_sk_ioctl(sk, cmd, arg); else if (sk_is_phonet(sk)) rc = phonet_sk_ioctl(sk, cmd, arg); /* If ioctl was processed, returns its value */ if (rc <= 0) return rc; /* Otherwise call the default handler */ return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl); static int __init sock_struct_check(void) { CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); return 0; } core_initcall(sock_struct_check);
1 1 1 41 1 40 33 12 12 12 12 43 85 27 20 1 10 115 116 117 109 109 109 108 109 108 61 61 36 33 3 41 40 30 3 34 33 34 34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines, Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * These functions handle all input from the IP layer into SCTP. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/time.h> /* For struct timeval */ #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/snmp.h> #include <net/sock.h> #include <net/xfrm.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> #include <net/sock_reuseport.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *daddr, int dif, int sdif); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif); static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); /* Calculate the SCTP checksum of an SCTP packet. */ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) { struct sctphdr *sh = sctp_hdr(skb); __le32 cmp = sh->checksum; __le32 val = sctp_compute_cksum(skb, 0); if (val != cmp) { /* CRC failure, dump it. */ __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS); return -1; } return 0; } /* * This is the routine which IP calls when receiving an SCTP packet. */ int sctp_rcv(struct sk_buff *skb) { struct sock *sk; struct sctp_association *asoc; struct sctp_endpoint *ep = NULL; struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); int dif, sdif; if (skb->pkt_type != PACKET_HOST) goto discard_it; __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); /* If packet is too small to contain a single chunk, let's not * waste time on it anymore. */ if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + skb_transport_offset(skb)) goto discard_it; /* If the packet is fragmented and we need to do crc checking, * it's better to just linearize it otherwise crc computing * takes longer. */ if ((!is_gso && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; /* Pull up the IP header. */ __skb_pull(skb, skb_transport_offset(skb)); skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); else if (!sctp_checksum_disable && !is_gso && sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); af->from_skb(&dest, skb, 0); dif = af->skb_iif(skb); sdif = af->skb_sdif(skb); /* If the packet is to or from a non-unicast address, * silently discard the packet. * * This is not clearly defined in the RFC except in section * 8.4 - OOTB handling. However, based on the book "Stream Control * Transmission Protocol" 2.1, "It is important to note that the * IP address of an SCTP transport address must be a routable * unicast address. In other words, IP multicast addresses and * IP broadcast addresses cannot be used in an SCTP transport * address." */ if (!af->addr_valid(&src, NULL, skb) || !af->addr_valid(&dest, NULL, skb)) goto discard_it; asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif); if (!asoc) ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; sk = rcvr->sk; /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) * packet if it is correctly formed, i.e., passed the * receiver's checksum check, but the receiver is not * able to identify the association to which this * packet belongs. */ if (!asoc) { if (sctp_rcv_ootb(skb)) { __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); goto discard_release; } } if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; /* Create an SCTP packet structure. */ chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC); if (!chunk) goto discard_release; SCTP_INPUT_CB(skb)->chunk = chunk; /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; /* Remember the SCTP header. */ chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); /* Remember where we came from. */ chunk->transport = transport; /* Acquire access to the sock lock. Note: We are safe from other * bottom halves on this lock, but a user may be in the lock too, * so check if it is busy. */ bh_lock_sock(sk); if (sk != rcvr->sk) { /* Our cached sk is different from the rcvr->sk. This is * because migrate()/accept() may have moved the association * to a new socket and released all the sockets. So now we * are holding a lock on the old socket while the user may * be doing something with the new socket. Switch our veiw * of the current sk. */ bh_unlock_sock(sk); sk = rcvr->sk; bh_lock_sock(sk); } if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; } __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG); } else { __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ); sctp_inq_push(&chunk->rcvr->inqueue, chunk); } bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); return 0; discard_it: __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); kfree_skb(skb); return 0; discard_release: /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); goto discard_it; } /* Process the backlog queue of the socket. Every skb on * the backlog holds a ref on an association or endpoint. * We hold this ref throughout the state machine to make * sure that the structure we need is still around. */ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_inq *inqueue = &chunk->rcvr->inqueue; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = NULL; int backloged = 0; rcvr = chunk->rcvr; /* If the rcvr is dead then the association or endpoint * has been deleted and we can safely drop the chunk * and refs that we are holding. */ if (rcvr->dead) { sctp_chunk_free(chunk); goto done; } if (unlikely(rcvr->sk != sk)) { /* In this case, the association moved from one socket to * another. We are currently sitting on the backlog of the * old socket, so we need to move. * However, since we are here in the process context we * need to take make sure that the user doesn't own * the new socket when we process the packet. * If the new socket is user-owned, queue the chunk to the * backlog of the new socket without dropping any refs. * Otherwise, we can safely push the chunk on the inqueue. */ sk = rcvr->sk; local_bh_disable(); bh_lock_sock(sk); if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; } else sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); local_bh_enable(); /* If the chunk was backloged again, don't drop refs */ if (backloged) return 0; } else { if (!sctp_newsk_ready(sk)) { if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) return 0; sctp_chunk_free(chunk); } else { sctp_inq_push(inqueue, chunk); } } done: /* Release the refs we took in sctp_add_backlog */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_put(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_put(sctp_ep(rcvr)); else BUG(); return 0; } static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = chunk->rcvr; int ret; ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear * from us */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_hold(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_hold(sctp_ep(rcvr)); else BUG(); } return ret; } /* Handle icmp frag needed error. */ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { if (!t || (t->pathmtu <= pmtu && t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)) return; if (sock_owned_by_user(sk)) { atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; } if (!(t->param_flags & SPP_PMTUD_ENABLE)) /* We can't allow retransmitting in such case, as the * retransmission would be sized just as before, and thus we * would get another icmp, and retransmit again. */ return; /* Update transports view of the MTU. Return if no update was needed. * If an update wasn't needed/possible, it also doesn't make sense to * try to retransmit now. */ if (!sctp_transport_update_pmtu(t, pmtu)) return; /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); /* Retransmit with the new pmtu setting. */ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, struct sk_buff *skb) { struct dst_entry *dst; if (sock_owned_by_user(sk) || !t) return; dst = sctp_transport_dst_check(t); if (dst) dst->ops->redirect(dst, sk, skb); } /* * SCTP Implementer's Guide, 2.37 ICMP handling procedures * * ICMP8) If the ICMP code is a "Unrecognized next header type encountered" * or a "Protocol Unreachable" treat this message as an abort * with the T bit set. * * This function sends an event to the state machine, which will abort the * association. * */ void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t) { if (sock_owned_by_user(sk)) { if (timer_pending(&t->proto_unreach_timer)) return; else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); pr_debug("%s: unrecognized next header type " "encountered!\n", __func__); if (del_timer(&t->proto_unreach_timer)) sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, t, GFP_ATOMIC); } } /* Common lookup code for icmp/icmpv6 error handler. */ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctphdr *sctphdr, struct sctp_association **app, struct sctp_transport **tpp) { struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; __u32 vtag = ntohl(sctphdr->vtag); int sdif = inet_sdif(skb); int dif = inet_iif(skb); *app = NULL; *tpp = NULL; af = sctp_get_af_specific(family); if (unlikely(!af)) { return NULL; } /* Initialize local addresses for lookups. */ af->from_skb(&saddr, skb, 1); af->from_skb(&daddr, skb, 0); /* Look for an association that matches the incoming ICMP error * packet. */ asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif); if (!asoc) return NULL; sk = asoc->base.sk; /* RFC 4960, Appendix C. ICMP Handling * * ICMP6) An implementation MUST validate that the Verification Tag * contained in the ICMP message matches the Verification Tag of * the peer. If the Verification Tag is not 0 and does NOT * match, discard the ICMP message. If it is 0 and the ICMP * message contains enough bytes to verify that the chunk type is * an INIT chunk and that the Initiate Tag matches the tag of the * peer, continue with ICMP7. If the ICMP message is too short * or the chunk type or the Initiate Tag does not match, silently * discard the packet. */ if (vtag == 0) { /* chunk header + first 4 octects of init header */ chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct sctphdr), sizeof(struct sctp_chunkhdr) + sizeof(__be32), &_chunkhdr); if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; } else if (vtag != asoc->c.peer_vtag) { goto out; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); *app = asoc; *tpp = transport; return sk; out: sctp_transport_put(transport); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. */ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) __releases(&((__sk)->sk_lock.slock)) { bh_unlock_sock(sk); sctp_transport_put(t); } static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) return; if (code == ICMP_FRAG_NEEDED) { sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info)); return; } if (code == ICMP_PROT_UNREACH) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } err = icmp_err_convert[code].errno; break; case ICMP_TIME_EXCEEDED: if (code == ICMP_EXC_FRAGTIME) return; err = EHOSTUNREACH; break; case ICMP_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: return; } if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the sctp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, iph->ihl * 4); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original values. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } sctp_v4_err_handle(transport, skb, type, code, info); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmphdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr)); if (hdr->type == ICMP_REDIRECT) { /* can't be handled without outer iphdr known, leave it to udp_err */ sctp_err_finish(sk, t); return 0; } if (hdr->type == ICMP_DEST_UNREACH && hdr->code == ICMP_FRAG_NEEDED) info = ntohs(hdr->un.frag.mtu); sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info); sctp_err_finish(sk, t); return 1; } /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * * This function scans all the chunks in the OOTB packet to determine if * the packet should be discarded right away. If a response might be needed * for this packet, or, if further processing is possible, the packet will * be queued to a proper inqueue for the next phase of handling. * * Output: * Return 0 - If further processing is needed. * Return 1 - If the packet can be discarded right away. */ static int sctp_rcv_ootb(struct sk_buff *skb) { struct sctp_chunkhdr *ch, _ch; int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { /* Make sure we have at least the header there */ if (offset + sizeof(_ch) > skb->len) break; ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the * receiver MUST silently discard the OOTB packet and take no * further action. */ if (SCTP_CID_ABORT == ch->type) goto discard; /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE * chunk, the receiver should silently discard the packet * and take no further action. */ if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type) goto discard; /* RFC 4460, 2.11.2 * This will discard packets with INIT chunk bundled as * subsequent chunks in the packet. When INIT is first, * the normal INIT processing will discard the chunk. */ if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; offset = ch_end; } while (ch_end < skb->len); return 0; discard: return 1; } /* Insert endpoint into the hash table. */ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); struct sctp_hashbucket *head; ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; if (sk->sk_reuseport) { bool any = sctp_is_ep_boundall(sk); struct sctp_endpoint *ep2; struct list_head *list; int cnt = 0, err = 1; list_for_each(list, &ep->base.bind_addr.address_list) cnt++; sctp_for_each_hentry(ep2, &head->chain) { struct sock *sk2 = ep2->base.sk; if (!net_eq(sock_net(sk2), net) || sk2 == sk || !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || !sk2->sk_reuseport) continue; err = sctp_bind_addrs_check(sctp_sk(sk2), sctp_sk(sk), cnt); if (!err) { err = reuseport_add_sock(sk, sk2, any); if (err) return err; break; } else if (err < 0) { return err; } } if (err) { err = reuseport_alloc(sk, any); if (err) return err; } } write_lock(&head->lock); hlist_add_head(&ep->node, &head->chain); write_unlock(&head->lock); return 0; } /* Add an endpoint to the hash. Local BH-safe. */ int sctp_hash_endpoint(struct sctp_endpoint *ep) { int err; local_bh_disable(); err = __sctp_hash_endpoint(ep); local_bh_enable(); return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; ep->hashent = sctp_ep_hashfn(sock_net(sk), ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); write_lock(&head->lock); hlist_del_init(&ep->node); write_unlock(&head->lock); } /* Remove endpoint from the hash. Local BH-safe. */ void sctp_unhash_endpoint(struct sctp_endpoint *ep) { local_bh_disable(); __sctp_unhash_endpoint(ep); local_bh_enable(); } static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, const union sctp_addr *paddr, __u32 seed) { __u32 addr; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else addr = (__force __u32)paddr->v4.sin_addr.s_addr; return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | (__force __u32)lport, net_hash_mix(net), seed); } /* Look up an endpoint. */ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; __be16 lport; int hash; lport = laddr->v4.sin_port; hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(ep, &head->chain) { if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif)) goto hit; } ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: sk = ep->base.sk; if (sk->sk_reuseport) { __u32 phash = sctp_hashfn(net, lport, paddr, 0); sk = reuseport_select_sock(sk, phash, skb, sizeof(struct sctphdr)); if (sk) ep = sctp_sk(sk)->ep; } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; } /* rhashtable for transport */ struct sctp_hash_cmp_arg { const union sctp_addr *paddr; const struct net *net; __be16 lport; }; static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return err; if (!sctp_transport_hold(t)) return err; if (!net_eq(t->asoc->base.net, x->net)) goto out; if (x->lport != htons(t->asoc->base.bind_addr.port)) goto out; err = 0; out: sctp_transport_put(t); return err; } static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; return sctp_hashfn(t->asoc->base.net, htons(t->asoc->base.bind_addr.port), &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { .head_offset = offsetof(struct sctp_transport, node), .hashfn = sctp_hash_key, .obj_hashfn = sctp_hash_obj, .obj_cmpfn = sctp_hash_cmp, .automatic_shrinking = true, }; int sctp_transport_hashtable_init(void) { return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params); } void sctp_transport_hashtable_destroy(void) { rhltable_destroy(&sctp_transport_hashtable); } int sctp_hash_transport(struct sctp_transport *t) { struct sctp_transport *transport; struct rhlist_head *tmp, *list; struct sctp_hash_cmp_arg arg; int err; if (t->asoc->temp) return 0; arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); rcu_read_lock(); list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { rcu_read_unlock(); return -EEXIST; } rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); if (err) pr_err_once("insert transport fail, errno %d\n", err); return err; } void sctp_unhash_transport(struct sctp_transport *t) { if (t->asoc->temp) return; rhltable_remove(&sctp_transport_hashtable, &t->node, sctp_hash_params); } bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { bool l3mdev_accept = true; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept); #endif return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif); } /* return a transport with holding it */ struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct rhlist_head *tmp, *list; struct sctp_transport *t; int bound_dev_if; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = net, .lport = laddr->v4.sin_port, }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) { if (!sctp_transport_hold(t)) continue; bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if); if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && sctp_bind_addr_match(&t->asoc->base.bind_addr, laddr, sctp_sk(t->asoc->base.sk))) return t; sctp_transport_put(t); } return NULL; } /* return a transport without holding it, as it's only used under sock lock */ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) if (ep == t->asoc->ep) return t; return NULL; } /* Look up an association. */ static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif) { struct sctp_transport *t; struct sctp_association *asoc = NULL; t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif); if (!t) goto out; asoc = t->asoc; *pt = t; out: return asoc; } /* Look up an association. protected by RCU read lock */ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); rcu_read_unlock(); return asoc; } /* Is there an association matching the given local and peer addresses? */ bool sctp_has_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_transport *transport; if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) { sctp_transport_put(transport); return true; } return false; } /* * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. * * D) When searching for a matching TCB upon reception of an INIT * or INIT-ACK chunk the receiver SHOULD use not only the * source address of the packet (containing the INIT or * INIT-ACK) but the receiver SHOULD also use all valid * address parameters contained within the chunk. * * 2.18.3 Solution description * * This new text clearly specifies to an implementor the need * to look within the INIT or INIT-ACK. Any implementation that * does not do this, may not be able to establish associations * in certain circumstances. * */ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; union sctp_addr addr; union sctp_addr *paddr = &addr; struct sctphdr *sh = sctp_hdr(skb); union sctp_params params; struct sctp_init_chunk *init; struct sctp_af *af; /* * This code will NOT touch anything inside the chunk--it is * strictly READ-ONLY. * * RFC 2960 3 SCTP packet Format * * Multiple chunks can be bundled into one SCTP packet up to * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN * COMPLETE chunks. These chunks MUST NOT be bundled with any * other chunk in a packet. See Section 6.10 for more details * on chunk bundling. */ /* Find the start of the TLVs and the end of the chunk. This is * the region we search for address parameters. */ init = (struct sctp_init_chunk *)skb->data; /* Walk the parameters looking for embedded addresses. */ sctp_walk_params(params, init) { /* Note: Ignoring hostname addresses. */ af = sctp_get_af_specific(param_type2af(params.p->type)); if (!af) continue; if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) return asoc; } return NULL; } /* ADD-IP, Section 5.2 * When an endpoint receives an ASCONF Chunk from the remote peer * special procedures may be needed to identify the association the * ASCONF Chunk is associated with. To properly find the association * the following procedures SHOULD be followed: * * D2) If the association is not found, use the address found in the * Address Parameter TLV combined with the port number found in the * SCTP common header. If found proceed to rule D4. * * D2-ext) If more than one ASCONF Chunks are packed together, use the * address found in the ASCONF Address Parameter TLV of each of the * subsequent ASCONF Chunks. If found, proceed to rule D4. */ static struct sctp_association *__sctp_rcv_asconf_lookup( struct net *net, struct sctp_chunkhdr *ch, const union sctp_addr *laddr, __be16 peer_port, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; union sctp_addr_param *param; union sctp_addr paddr; if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) return NULL; /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); af = sctp_get_af_specific(param_type2af(param->p.type)); if (unlikely(!af)) return NULL; if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif); } /* SCTP-AUTH, Section 6.3: * If the receiver does not find a STCB for a packet containing an AUTH * chunk as the first chunk and not a COOKIE-ECHO chunk as the second * chunk, it MUST use the chunks after the AUTH chunk to look up an existing * association. * * This means that any chunks that can help us identify the association need * to be looked at to find this association. */ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc = NULL; struct sctp_chunkhdr *ch; int have_auth = 0; unsigned int chunk_num = 1; __u8 *ch_end; /* Walk through the chunks looking for AUTH or ASCONF chunks * to help us find the association. */ ch = (struct sctp_chunkhdr *)skb->data; do { /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(*ch)) break; ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; switch (ch->type) { case SCTP_CID_AUTH: have_auth = chunk_num; break; case SCTP_CID_COOKIE_ECHO: /* If a packet arrives containing an AUTH chunk as * a first chunk, a COOKIE-ECHO chunk as the second * chunk, and possibly more chunks after them, and * the receiver does not have an STCB for that * packet, then authentication is based on * the contents of the COOKIE- ECHO chunk. */ if (have_auth == 1 && chunk_num == 2) return NULL; break; case SCTP_CID_ASCONF: if (have_auth || net->sctp.addip_noauth) asoc = __sctp_rcv_asconf_lookup( net, ch, laddr, sctp_hdr(skb)->source, transportp, dif, sdif); break; default: break; } if (asoc) break; ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } /* * There are circumstances when we need to look inside the SCTP packet * for information to help us find the association. Examples * include looking inside of INIT/INIT-ACK chunks or after the AUTH * chunks. */ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_chunkhdr *ch; /* We do not allow GSO frames here as we need to linearize and * then cannot guarantee frame boundaries. This shouldn't be an * issue as packets hitting this are mostly INIT or INIT-ACK and * those cannot be on GSO-style anyway. */ if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return NULL; ch = (struct sctp_chunkhdr *)skb->data; /* The code below will attempt to walk the chunk and extract * parameter information. Before we do that, we need to verify * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif); return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif); } /* Lookup an association for an inbound skb. */ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) goto out; /* Further lookup for INIT/INIT-ACK packets. * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. */ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif); if (asoc) goto out; if (paddr->sa.sa_family == AF_INET) pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n", &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port), &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port)); else pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n", &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port), &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port)); out: return asoc; }
2 1 1 1 1 4 4 1 2 2 1 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Basic Transport Functions exploiting Infiniband API * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/socket.h> #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> #include <linux/wait.h> #include <linux/reboot.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/smc.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include "smc.h" #include "smc_clc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_wr.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" #include "smc_netlink.h" #include "smc_stats.h" #include "smc_tracepoint.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, }; static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); static void smc_link_down_work(struct work_struct *work); /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) { if (lgr->is_smcd) { *lgr_lock = &lgr->smcd->lgr_lock; return &lgr->smcd->lgr_list; } *lgr_lock = &smc_lgr_list.lock; return &smc_lgr_list.list; } static void smc_ibdev_cnt_inc(struct smc_link *lnk) { atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); } static void smc_ibdev_cnt_dec(struct smc_link *lnk) { atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); } static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ if (!lgr->freeing) { mod_delayed_work(system_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } } /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock * @smc connection to register * Returns 0 on success, != otherwise. */ static void smc_lgr_add_alert_token(struct smc_connection *conn) { struct rb_node **link, *parent = NULL; u32 token = conn->alert_token_local; link = &conn->lgr->conns_all.rb_node; while (*link) { struct smc_connection *cur = rb_entry(*link, struct smc_connection, alert_node); parent = *link; if (cur->alert_token_local > token) link = &parent->rb_left; else link = &parent->rb_right; } /* Put the new node there */ rb_link_node(&conn->alert_node, parent, link); rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } /* assign an SMC-R link to the connection */ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) { enum smc_link_state expected = first ? SMC_LNK_ACTIVATING : SMC_LNK_ACTIVE; int i, j; /* do link balancing */ conn->lnk = NULL; /* reset conn->lnk first */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; if (lnk->state != expected || lnk->link_is_asym) continue; if (conn->lgr->role == SMC_CLNT) { conn->lnk = lnk; /* temporary, SMC server assigns link*/ break; } if (conn->lgr->conns_num % 2) { for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { struct smc_link *lnk2; lnk2 = &conn->lgr->lnk[j]; if (lnk2->state == expected && !lnk2->link_is_asym) { conn->lnk = lnk2; break; } } } if (!conn->lnk) conn->lnk = lnk; break; } if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; atomic_inc(&conn->lnk->conn_cnt); return 0; } /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); int rc; if (!conn->lgr->is_smcd) { rc = smcr_lgr_conn_assign_link(conn, first); if (rc) { conn->lgr = NULL; return rc; } } /* find a new alert_token_local value not yet used by some connection * in this link group */ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ while (!conn->alert_token_local) { conn->alert_token_local = atomic_inc_return(&nexttoken); if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; return 0; } /* Unregister connection and reset the alert token of the given connection< */ static void __smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); if (conn->lnk) atomic_dec(&conn->lnk->conn_cnt); lgr->conns_num--; conn->alert_token_local = 0; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } /* Unregister connection from lgr */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!smc_conn_lgr_valid(conn)) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); char hostname[SMC_MAX_HOSTNAME_LEN + 1]; char smc_seid[SMC_MAX_EID_LEN + 1]; struct nlattr *attrs; u8 *seid = NULL; u8 *host = NULL; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_SYS_INFO); if (!nlh) goto errmsg; if (cb_ctx->pos[0]) goto errout; attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2)) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE)) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true)) goto errattr; smc_clc_get_hostname(&host); if (host) { memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); hostname[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) goto errattr; } if (smc_ism_is_v2_capable()) { smc_ism_get_system_eid(&seid); memcpy(smc_seid, seid, SMC_MAX_EID_LEN); smc_seid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) goto errattr; } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); cb_ctx->pos[0] = 1; return skb->len; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return skb->len; } /* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */ static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb, struct nlattr *v2_attrs) { char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; char smc_eid[SMC_MAX_EID_LEN + 1]; if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) goto errv2attr; memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); smc_host[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) goto errv2attr; memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); smc_eid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; errv2attr: nla_nest_cancel(skb, v2_attrs); return -EMSGSIZE; } static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *v2_attrs; v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2); if (!v2_attrs) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links)) goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; errv2attr: nla_nest_cancel(skb, v2_attrs); errattr: return -EMSGSIZE; } static int smc_nl_fill_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_target[SMC_MAX_PNETID_LEN + 1]; struct nlattr *attrs, *v2_attrs; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); if (!attrs) goto errout; if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id))) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE, lgr->net->net_cookie, SMC_NLA_LGR_R_PAD)) goto errattr; memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; if (lgr->smc_version > SMC_V1) { v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); if (!v2_attrs) goto errattr; if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) goto errattr; if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb)) goto errattr; } nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, struct smc_link *link, struct sk_buff *skb, struct netlink_callback *cb) { char smc_ibname[IB_DEVICE_NAME_MAX]; u8 smc_gid_target[41]; struct nlattr *attrs; u32 link_uid = 0; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LINK_SMCR); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT, atomic_read(&link->conn_cnt))) goto errattr; if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx)) goto errattr; snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname); if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname)) goto errattr; memcpy(&link_uid, link->link_uid, sizeof(link_uid)); if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid)) goto errattr; memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid)); if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid)) goto errattr; memset(smc_gid_target, 0, sizeof(smc_gid_target)); smc_gid_be16_convert(smc_gid_target, link->gid); if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target)) goto errattr; memset(smc_gid_target, 0, sizeof(smc_gid_target)); smc_gid_be16_convert(smc_gid_target, link->peer_gid); if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) goto errattr; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static int smc_nl_handle_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb, bool list_links) { void *nlh; int i; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LGR_SMCR); if (!nlh) goto errmsg; if (smc_nl_fill_lgr(lgr, skb, cb)) goto errout; genlmsg_end(skb, nlh); if (!list_links) goto out; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_usable(&lgr->lnk[i])) continue; if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) goto errout; } out: return 0; errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, struct sk_buff *skb, struct netlink_callback *cb, bool list_links) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smc_link_group *lgr; int snum = cb_ctx->pos[0]; int num = 0; spin_lock_bh(&smc_lgr->lock); list_for_each_entry(lgr, &smc_lgr->list, list) { if (num < snum) goto next; if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) goto errout; next: num++; } errout: spin_unlock_bh(&smc_lgr->lock); cb_ctx->pos[0] = num; } static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smcd_dev *smcd = lgr->smcd; struct smcd_gid smcd_gid; struct nlattr *attrs; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LGR_SMCD); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD); if (!attrs) goto errout; if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; smcd->ops->get_local_gid(smcd, &smcd_gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_EXT_GID, smcd_gid.gid_ext, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_EXT_GID, lgr->peer_gid.gid_ext, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) goto errattr; memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) goto errattr; if (lgr->smc_version > SMC_V1) { struct nlattr *v2_attrs; v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON); if (!v2_attrs) goto errattr; if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) goto errattr; } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smc_link_group *lgr; int snum = cb_ctx->pos[1]; int rc = 0, num = 0; spin_lock_bh(&dev->lgr_lock); list_for_each_entry(lgr, &dev->lgr_list, list) { if (!lgr->is_smcd) continue; if (num < snum) goto next; rc = smc_nl_fill_smcd_lgr(lgr, skb, cb); if (rc) goto errout; next: num++; } errout: spin_unlock_bh(&dev->lgr_lock); cb_ctx->pos[1] = num; return rc; } static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smcd_dev *smcd_dev; int snum = cb_ctx->pos[0]; int rc = 0, num = 0; mutex_lock(&dev_list->mutex); list_for_each_entry(smcd_dev, &dev_list->list, list) { if (list_empty(&smcd_dev->lgr_list)) continue; if (num < snum) goto next; rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb); if (rc) goto errout; next: num++; } errout: mutex_unlock(&dev_list->mutex); cb_ctx->pos[0] = num; return rc; } int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = false; smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = true; smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) { smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb); return skb->len; } void smc_lgr_cleanup_early(struct smc_link_group *lgr) { spinlock_t *lgr_lock; if (!lgr) return; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ if (!list_empty(&lgr->list)) list_del_init(&lgr->list); spin_unlock_bh(lgr_lock); __smc_lgr_terminate(lgr, true); } static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) { int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_sendable(lnk)) lnk->state = SMC_LNK_INACTIVE; } wake_up_all(&lgr->llc_msg_waiter); wake_up_all(&lgr->llc_flow_waiter); } static void smc_lgr_free(struct smc_link_group *lgr); static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); spinlock_t *lgr_lock; bool conns; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); if (lgr->freeing) { spin_unlock_bh(lgr_lock); return; } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ spin_unlock_bh(lgr_lock); return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); if (!lgr->is_smcd && !lgr->terminating) smc_llc_send_link_delete_all(lgr, true, SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); if (!lgr->is_smcd) smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } static void smc_lgr_terminate_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(work, struct smc_link_group, terminate_work); __smc_lgr_terminate(lgr, true); } /* return next unique link id for the lgr */ static u8 smcr_next_link_id(struct smc_link_group *lgr) { u8 link_id; int i; while (1) { again: link_id = ++lgr->next_link_id; if (!link_id) /* skip zero as link_id */ link_id = ++lgr->next_link_id; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (smc_link_usable(&lgr->lnk[i]) && lgr->lnk[i].link_id == link_id) goto again; } break; } return link_id; } static void smcr_copy_dev_info_to_link(struct smc_link *link) { struct smc_ib_device *smcibdev = link->smcibdev; snprintf(link->ibname, sizeof(link->ibname), "%s", smcibdev->ibdev->name); link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1]; } int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini) { struct smc_ib_device *smcibdev; u8 rndvec[3]; int rc; if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; lnk->wr_rx_id_compl = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!lnk->smcibdev->initialized) { rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev); if (rc) goto out; } get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, ini->vlan_id, lnk->gid, &lnk->sgid_index, lgr->smc_version == SMC_V2 ? &ini->smcrv2 : NULL); if (rc) goto out; rc = smc_llc_link_init(lnk); if (rc) goto out; rc = smc_wr_alloc_link_mem(lnk); if (rc) goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) goto free_link_mem; rc = smc_ib_create_queue_pair(lnk); if (rc) goto dealloc_pd; rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; lnk->state = SMC_LNK_ACTIVATING; return 0; destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); free_link_mem: smc_wr_free_link_mem(lnk); clear_llc_lnk: smc_llc_link_clear(lnk, false); out: smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold above */ return rc; } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct list_head *lgr_list; struct smcd_dev *smcd; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; int rc = 0; int i; if (ini->is_smcd && ini->vlan_id) { if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id)) { rc = SMC_CLC_DECL_ISMVLANERR; goto out; } } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, SMC_LGR_ID_SIZE, &lgr->id); if (!lgr->tx_wq) { rc = -ENOMEM; goto free_lgr; } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ init_rwsem(&lgr->sndbufs_lock); init_rwsem(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; get_device(smcd->ops->get_dev(smcd)); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = ini->ism_peer_gid[ini->ism_selected].gid_ext; lgr->smcd = ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; lgr->smc_version = ini->smcd_version; lgr->peer_shutdown = 0; atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ struct smc_ib_device *ibdev; int ibport; lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; lgr->smc_version = ini->smcr_version; memcpy(lgr->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN); if (lgr->smc_version == SMC_V2) { ibdev = ini->smcrv2.ib_dev_v2; ibport = ini->smcrv2.ib_port_v2; lgr->saddr = ini->smcrv2.saddr; lgr->uses_gateway = ini->smcrv2.uses_gateway; memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, ETH_ALEN); lgr->max_conns = ini->max_conns; lgr->max_links = ini->max_links; } else { ibdev = ini->ib_dev; ibport = ini->ib_port; lgr->max_conns = SMC_CONN_PER_LGR_MAX; lgr->max_links = SMC_LINKS_ADD_LNK_MAX; } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); rc = smc_wr_alloc_lgr_mem(lgr); if (rc) goto free_wq; smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { smc_wr_free_lgr_mem(lgr); goto free_wq; } lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); return 0; free_wq: destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); ism_put_vlan: if (ini->is_smcd && ini->vlan_id) smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) rc = SMC_CLC_DECL_MEM; else rc = SMC_CLC_DECL_INTERR; } return rc; } static int smc_write_space(struct smc_connection *conn) { int buffer_len = conn->peer_rmbe_size; union smc_host_cursor prod; union smc_host_cursor cons; int space; smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* determine rx_buf space */ space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); return space; } static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, struct smc_wr_buf *wr_buf) { struct smc_connection *conn = &smc->conn; union smc_host_cursor cons, fin; int rc = 0; int diff; smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); /* set prod cursor to old state, enforce tx_rdma_writes() */ smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { /* cons cursor advanced more than fin, and prod was set * fin above, so now prod is smaller than cons. Fix that. */ diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); smc_curs_add(conn->sndbuf_desc->len, &conn->tx_curs_sent, diff); smc_curs_add(conn->sndbuf_desc->len, &conn->tx_curs_fin, diff); smp_mb__before_atomic(); atomic_add(diff, &conn->sndbuf_space); smp_mb__after_atomic(); smc_curs_add(conn->peer_rmbe_size, &conn->local_tx_ctrl.prod, diff); smc_curs_add(conn->peer_rmbe_size, &conn->local_tx_ctrl_fin, diff); } /* recalculate, value is used by tx_rdma_writes() */ atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); if (smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); if (!rc) { queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); } } else { smc_wr_tx_put_slot(conn->lnk, (struct smc_wr_tx_pend_priv *)pend); } return rc; } void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); /* link_hold in smc_conn_create() */ smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); /* link_put in smc_conn_free() */ smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { struct smc_link *to_lnk = NULL; struct smc_cdc_tx_pend *pend; struct smc_connection *conn; struct smc_wr_buf *wr_buf; struct smc_sock *smc; struct rb_node *node; int i, rc = 0; /* link is inactive, wake up tx waiters */ smc_wr_wakeup_tx_wait(from_lnk); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx) continue; if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && from_lnk->ibport == lgr->lnk[i].ibport) { continue; } to_lnk = &lgr->lnk[i]; break; } if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { smc_lgr_terminate_sched(lgr); return NULL; } again: read_lock_bh(&lgr->conns_lock); for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { conn = rb_entry(node, struct smc_connection, alert_node); if (conn->lnk != from_lnk) continue; smc = container_of(conn, struct smc_sock, conn); /* conn->lnk not yet set in SMC_INIT state */ if (smc->sk.sk_state == SMC_INIT) continue; if (smc->sk.sk_state == SMC_CLOSED || smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || smc->sk.sk_state == SMC_APPFINCLOSEWAIT || smc->sk.sk_state == SMC_APPCLOSEWAIT1 || smc->sk.sk_state == SMC_APPCLOSEWAIT2 || smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || smc->sk.sk_state == SMC_PEERABORTWAIT || smc->sk.sk_state == SMC_PROCESSABORT) { spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); spin_unlock_bh(&conn->send_lock); continue; } sock_hold(&smc->sk); read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); if (rc) goto err_out; /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); if (rc) goto err_out; goto again; } read_unlock_bh(&lgr->conns_lock); smc_wr_tx_link_put(to_lnk); return to_lnk; err_out: smcr_link_down_cond_sched(to_lnk); smc_wr_tx_link_put(to_lnk); return NULL; } static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { struct rw_semaphore *lock; /* lock buffer list */ int rc; if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ down_read(&lgr->llc_conf_mutex); smc_llc_do_delete_rkey(lgr, buf_desc); buf_desc->is_conf_rkey = false; up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock; down_write(lock); list_del(&buf_desc->list); up_write(lock); smc_buf_free(lgr, is_rmb, buf_desc); } else { /* memzero_explicit provides potential memory barrier semantics */ memzero_explicit(buf_desc->cpu_addr, buf_desc->len); WRITE_ONCE(buf_desc->used, 0); } } static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) { if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { smcr_buf_unuse(conn->sndbuf_desc, false, lgr); } else { memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); WRITE_ONCE(conn->sndbuf_desc->used, 0); } } if (conn->rmb_desc) { if (!lgr->is_smcd) { smcr_buf_unuse(conn->rmb_desc, true, lgr); } else { memzero_explicit(conn->rmb_desc->cpu_addr, conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); WRITE_ONCE(conn->rmb_desc->used, 0); } } } /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!lgr || conn->freed) /* Connection has never been registered in a * link group, or has already been freed. */ return; conn->freed = 1; if (!smc_conn_lgr_valid(conn)) /* Connection has already unregistered from * link group. */ goto lgr_put; if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); if (current_work() != &conn->abort_work) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { smc_buf_unuse(conn, lgr); /* allow buffer reuse */ smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: if (!lgr->is_smcd) smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } /* unregister a link from a buf_desc */ static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; if ((is_rmb || buf_desc->is_vm) && buf_desc->mr[lnk->link_idx]) { smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); buf_desc->mr[lnk->link_idx] = NULL; } if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } /* unmap all buffers of lgr for a deleted link */ static void smcr_buf_unmap_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_buf_desc *buf_desc, *bf; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { down_write(&lgr->rmbs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) smcr_buf_unmap_link(buf_desc, true, lnk); up_write(&lgr->rmbs_lock); down_write(&lgr->sndbufs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) smcr_buf_unmap_link(buf_desc, false, lnk); up_write(&lgr->sndbufs_lock); } } static void smcr_rtoken_clear_link(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { lgr->rtokens[i][lnk->link_idx].rkey = 0; lgr->rtokens[i][lnk->link_idx].dma_addr = 0; } } static void __smcr_link_clear(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { if (!lnk->lgr || lnk->clearing || lnk->state == SMC_LNK_UNUSED) return; lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smcr_link_put(lnk); /* theoretically last link_put */ } void smcr_link_hold(struct smc_link *lnk) { refcount_inc(&lnk->refcnt); } void smcr_link_put(struct smc_link *lnk) { if (refcount_dec_and_test(&lnk->refcnt)) __smcr_link_clear(lnk); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); else if (buf_desc->is_vm && buf_desc->cpu_addr) vfree(buf_desc->cpu_addr); kfree(buf_desc); } static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, struct smc_buf_desc *buf_desc) { if (is_dmb) { /* restore original buf len */ buf_desc->len += sizeof(struct smcd_cdc_msg); smc_ism_unregister_dmb(lgr->smcd, buf_desc); } else { kfree(buf_desc->cpu_addr); } kfree(buf_desc); } static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { if (lgr->is_smcd) smcd_buf_free(lgr, is_rmb, buf_desc); else smcr_buf_free(lgr, is_rmb, buf_desc); } static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { if (is_rmb) buf_list = &lgr->rmbs[i]; else buf_list = &lgr->sndbufs[i]; list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { list_del(&buf_desc->list); smc_buf_free(lgr, is_rmb, buf_desc); } } } static void smc_lgr_free_bufs(struct smc_link_group *lgr) { /* free send buffers */ __smc_lgr_free_bufs(lgr, false); /* free rmbs */ __smc_lgr_free_bufs(lgr, true); } /* won't be freed until no one accesses to lgr anymore */ static void __smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { smc_wr_free_lgr_mem(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } kfree(lgr); } /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { int i; if (!lgr->is_smcd) { down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i], false); } up_write(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); } destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(lgr->smcd->ops->get_dev(lgr->smcd)); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } void smc_lgr_hold(struct smc_link_group *lgr) { refcount_inc(&lgr->refcnt); } void smc_lgr_put(struct smc_link_group *lgr) { if (refcount_dec_and_test(&lgr->refcnt)) __smc_lgr_free(lgr); } static void smc_sk_wake_ups(struct smc_sock *smc) { smc->sk.sk_write_space(&smc->sk); smc->sk.sk_data_ready(&smc->sk); smc->sk.sk_state_change(&smc->sk); } /* kill a connection */ static void smc_conn_kill(struct smc_connection *conn, bool soft) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else smc_close_abort(conn); conn->killed = 1; smc->sk.sk_err = ECONNABORTED; smc_sk_wake_ups(smc); if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); if (soft) tasklet_kill(&conn->rx_tsklet); else tasklet_unlock_wait(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); } smc_lgr_unregister_conn(conn); smc_close_active_abort(smc); } static void smc_lgr_cleanup(struct smc_link_group *lgr) { if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); } else { u32 rsn = lgr->llc_termination_rsn; if (!rsn) rsn = SMC_LLC_DEL_PROG_INIT_TERM; smc_llc_send_link_delete_all(lgr, false, rsn); smcr_lgr_link_deactivate_all(lgr); } } /* terminate link group * @soft: true if link group shutdown can take its time * false if immediate link group shutdown is required */ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; if (lgr->terminating) return; /* lgr already terminating */ /* cancel free_work sync, will terminate when lgr->freeing is set */ cancel_delayed_work(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put below */ lock_sock(&smc->sk); smc_conn_kill(conn, soft); release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold above */ read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); smc_lgr_free(lgr); } /* unlink link group and schedule termination */ void smc_lgr_terminate_sched(struct smc_link_group *lgr) { spinlock_t *lgr_lock; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) { spin_unlock_bh(lgr_lock); return; /* lgr already terminating */ } list_del_init(&lgr->list); lgr->freeing = 1; spin_unlock_bh(lgr_lock); schedule_work(&lgr->terminate_work); } /* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ spin_lock_bh(&dev->lgr_lock); list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { if ((!peer_gid->gid || (lgr->peer_gid.gid == peer_gid->gid && !smc_ism_is_emulated(dev) ? 1 : lgr->peer_gid.gid_ext == peer_gid->gid_ext)) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { if (peer_gid->gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); schedule_work(&lgr->terminate_work); } } /* Called when an SMCD device is removed or the smc module is unloaded */ void smc_smcd_terminate_all(struct smcd_dev *smcd) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); spin_lock_bh(&smcd->lgr_lock); list_splice_init(&smcd->lgr_list, &lgr_free_list); list_for_each_entry(lgr, &lgr_free_list, list) lgr->freeing = 1; spin_unlock_bh(&smcd->lgr_lock); list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); __smc_lgr_terminate(lgr, false); } if (atomic_read(&smcd->lgr_cnt)) wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); } /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. */ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { list_splice_init(&smc_lgr_list.list, &lgr_free_list); list_for_each_entry(lgr, &lgr_free_list, list) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) smcr_link_down_cond_sched(&lgr->lnk[i]); } } } spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, !atomic_read(&smcibdev->lnk_cnt)); } else { if (atomic_read(&lgr_cnt)) wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); } } /* set new lgr type and clear all asymmetric link tagging */ void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) { char *lgr_type = ""; int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) if (smc_link_usable(&lgr->lnk[i])) lgr->lnk[i].link_is_asym = false; if (lgr->type == new_type) return; lgr->type = new_type; switch (lgr->type) { case SMC_LGR_NONE: lgr_type = "NONE"; break; case SMC_LGR_SINGLE: lgr_type = "SINGLE"; break; case SMC_LGR_SYMMETRIC: lgr_type = "SYMMETRIC"; break; case SMC_LGR_ASYMMETRIC_PEER: lgr_type = "ASYMMETRIC_PEER"; break; case SMC_LGR_ASYMMETRIC_LOCAL: lgr_type = "ASYMMETRIC_LOCAL"; break; } pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: " "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie, lgr_type, lgr->pnet_id); } /* set new lgr type and tag a link as asymmetric */ void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx) { smcr_lgr_set_type(lgr, new_type); lgr->lnk[asym_lnk_idx].link_is_asym = true; } /* abort connection, abort_work scheduled from tasklet context */ static void smc_conn_abort_work(struct work_struct *work) { struct smc_connection *conn = container_of(work, struct smc_connection, abort_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); lock_sock(&smc->sk); smc_conn_kill(conn, true); release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ } void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { struct smc_link *link; if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN) || lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); if (link) smc_llc_add_link_local(link); } spin_unlock_bh(&smc_lgr_list.lock); } /* link is down - switch connections to alternate link, * must be called under lgr->llc_conf_mutex lock */ static void smcr_link_down(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_link *to_lnk; int del_link_id; if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) return; to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ smcr_link_clear(lnk, true); return; } smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); del_link_id = lnk->link_id; if (lgr->role == SMC_SERV) { /* trigger local delete link processing */ smc_llc_srv_delete_link_local(to_lnk, del_link_id); } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ up_write(&lgr->llc_conf_mutex); wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); down_write(&lgr->llc_conf_mutex); } if (!list_empty(&lgr->list)) { smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, SMC_LLC_DEL_LOST_PATH); smcr_link_clear(lnk, true); } wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */ } } /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_down_cond(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); smcr_link_down(lnk); } } /* will get the lgr->llc_conf_mutex lock */ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); schedule_work(&lnk->link_down_wrk); } } void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; int i; list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN)) continue; /* lgr is not affected */ if (list_empty(&lgr->list)) continue; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk) && lnk->smcibdev == smcibdev && lnk->ibport == ibport) smcr_link_down_cond_sched(lnk); } } } static void smc_link_down_work(struct work_struct *work) { struct smc_link *link = container_of(work, struct smc_link, link_down_wrk); struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) return; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, struct netdev_nested_priv *priv) { unsigned short *vlan_id = (unsigned short *)priv->data; if (is_vlan_dev(lower_dev)) { *vlan_id = vlan_dev_vlan_id(lower_dev); return 1; } return 0; } /* Determine vlan of internal TCP socket. */ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct netdev_nested_priv priv; struct net_device *ndev; int rc = 0; ini->vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } ndev = dst->dev; if (is_vlan_dev(ndev)) { ini->vlan_id = vlan_dev_vlan_id(ndev); goto out_rel; } priv.data = (void *)&ini->vlan_id; rtnl_lock(); netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); rtnl_unlock(); out_rel: dst_release(dst); out: return rc; } static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, u8 peer_systemid[], u8 peer_gid[], u8 peer_mac_v1[], enum smc_lgr_role role, u32 clcqpn, struct net *net) { struct smc_link *lnk; int i; if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || lgr->role != role) return false; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { lnk = &lgr->lnk[i]; if (!smc_link_active(lnk)) continue; /* use verbs API to check netns, instead of lgr->net */ if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) return false; if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && (smcr_version == SMC_V2 || !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, struct smcd_dev *smcismdev, struct smcd_gid *peer_gid) { if (lgr->peer_gid.gid != peer_gid->gid || lgr->smcd != smcismdev) return false; if (smc_ism_is_emulated(smcismdev) && lgr->peer_gid.gid_ext != peer_gid->gid_ext) return false; return true; } /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; struct net *net = sock_net(&smc->sk); struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; spinlock_t *lgr_lock; int rc = 0; lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list : &smc_lgr_list.list; lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock : &smc_lgr_list.lock; ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ spin_lock_bh(lgr_lock); list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], &ini->ism_peer_gid[ini->ism_selected]) : smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, ini->ib_clcqpn, net)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || (lgr->conns_num < lgr->max_conns && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); if (!rc && delayed_work_pending(&lgr->free_work)) cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); if (rc) return rc; if (role == SMC_CLNT && !ini->first_contact_peer && ini->first_contact_local) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. error */ return SMC_CLC_DECL_SYNCERR; } create: if (ini->first_contact_local) { rc = smc_lgr_create(smc, ini); if (rc) goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) { smc_lgr_cleanup_early(lgr); goto out; } } smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ if (!conn->lgr->is_smcd) smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } else { conn->rx_off = 0; } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif out: return rc; } #define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ #define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; size = (size - 1) >> 14; /* convert to 16K multiple */ compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); if (!is_smcd && is_rmb) /* RMBs are backed by & limited to max size of scatterlists */ compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); return compressed; } /* convert the RMB size from compressed notation into integer */ int smc_uncompress_bufsize(u8 compressed) { u32 size; size = 0x00000001 << (((int)compressed) + 14); return (int)size; } /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, struct rw_semaphore *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { up_read(lock); return buf_slot; } } up_read(lock); return NULL; } /* one of the conditions for announcing a receiver's current window size is * that it "results in a minimum increase in the window size of 10% of the * receive buffer space" [RFC7609] */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } /* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { int rc, i, nents, offset, buf_size, size, access_flags; struct scatterlist *sg; void *buf; if (buf_desc->is_map_ib[lnk->link_idx]) return 0; if (buf_desc->is_vm) { buf = buf_desc->cpu_addr; buf_size = buf_desc->len; offset = offset_in_page(buf_desc->cpu_addr); nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; } else { nents = 1; } rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; if (buf_desc->is_vm) { /* virtually contiguous buffer */ for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { size = min_t(int, PAGE_SIZE - offset, buf_size); sg_set_page(sg, vmalloc_to_page(buf), size, offset); buf += size / sizeof(*buf); buf_size -= size; offset = 0; } } else { /* physically contiguous buffer */ sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, buf_desc->cpu_addr, buf_desc->len); } /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != nents) { rc = -EAGAIN; goto free_table; } buf_desc->is_dma_need_sync |= smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; if (is_rmb || buf_desc->is_vm) { /* create a new memory region for the RMB or vzalloced sndbuf */ access_flags = is_rmb ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_LOCAL_WRITE; rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; smc_ib_sync_sg_for_device(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } buf_desc->is_map_ib[lnk->link_idx] = true; return 0; buf_unmap: smc_ib_buf_unmap_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); free_table: sg_free_table(&buf_desc->sgt[lnk->link_idx]); return rc; } /* register a new buf on IB device, rmb or vzalloced sndbuf * must be called under lgr->llc_conf_mutex lock */ int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; if (!buf_desc->is_reg_mr[link->link_idx]) { /* register memory region for new buf */ if (buf_desc->is_vm) buf_desc->mr[link->link_idx]->iova = (uintptr_t)buf_desc->cpu_addr; if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { buf_desc->is_reg_err = true; return -EFAULT; } buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock, struct list_head *lst, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf; int rc = 0; down_write(lock); list_for_each_entry_safe(buf_desc, bf, lst, list) { if (!buf_desc->used) continue; rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); if (rc) goto out; } out: up_write(lock); return rc; } /* map all used buffers of lgr for a new link */ int smcr_buf_map_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; int i, rc = 0; for (i = 0; i < SMC_RMBE_SIZES; i++) { rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, &lgr->rmbs[i], true); if (rc) return rc; rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, &lgr->sndbufs[i], false); if (rc) return rc; } return 0; } /* register all used buffers of lgr for a new link, * must be called under lgr->llc_conf_mutex lock */ int smcr_buf_reg_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_buf_desc *buf_desc, *bf; int i, rc = 0; /* reg all RMBs for a new link */ down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { up_write(&lgr->rmbs_lock); return rc; } } } up_write(&lgr->rmbs_lock); if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) return rc; /* reg all vzalloced sndbufs for a new link */ down_write(&lgr->sndbufs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { if (!buf_desc->used || !buf_desc->is_vm) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { up_write(&lgr->sndbufs_lock); return rc; } } } up_write(&lgr->sndbufs_lock); return rc; } static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); switch (lgr->buf_type) { case SMCR_PHYS_CONT_BUFS: case SMCR_MIXED_BUFS: buf_desc->order = get_order(bufsize); buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | __GFP_COMP | __GFP_NORETRY | __GFP_ZERO, buf_desc->order); if (buf_desc->pages) { buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); buf_desc->len = bufsize; buf_desc->is_vm = false; break; } if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) goto out; fallthrough; // try virtually continguous buf case SMCR_VIRT_CONT_BUFS: buf_desc->order = get_order(bufsize); buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); if (!buf_desc->cpu_addr) goto out; buf_desc->pages = NULL; buf_desc->len = bufsize; buf_desc->is_vm = true; break; } return buf_desc; out: kfree(buf_desc); return ERR_PTR(-EAGAIN); } /* map buf_desc on all usable links, * unused buffers stay mapped as long as the link is up */ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc, bool is_rmb) { int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { rc = -ENOMEM; goto out; } cnt++; } out: up_read(&lgr->llc_conf_mutex); if (!rc && !cnt) rc = -EINVAL; return rc; } static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); if (is_dmb) { rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); if (rc == -ENOMEM) return ERR_PTR(-EAGAIN); if (rc == -ENOSPC) return ERR_PTR(-ENOSPC); return ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); } else { buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC); if (!buf_desc->cpu_addr) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } buf_desc->len = bufsize; } return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_comp; struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ bufsize = smc->sk.sk_sndbuf / 2; for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; buf_list = &lgr->sndbufs[bufsize_comp]; } bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ } if (is_smcd) buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); else buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) { if (!is_dgraded) { is_dgraded = true; SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); } continue; } SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; down_write(lock); list_add(&buf_desc->list, buf_list); up_write(lock); break; /* found */ } if (IS_ERR(buf_desc)) return PTR_ERR(buf_desc); if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_comp = bufsize_comp; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); if (is_smcd) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { if (!conn->sndbuf_desc->is_dma_need_sync) return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; if (!conn->rmb_desc->is_dma_need_sync) return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); } } /* create the send and receive buffer for an SMC socket; * receive buffers are called RMBs; * (even though the SMC protocol allows more than one RMB-element per RMB, * the Linux implementation uses just one RMB-element per RMB, i.e. uses an * extra RMB for every connection in a link group */ int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); if (rc) { down_write(&smc->conn.lgr->sndbufs_lock); list_del(&smc->conn.sndbuf_desc->list); up_write(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); smc->conn.sndbuf_desc = NULL; } return rc; } static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { if (!test_and_set_bit(i, lgr->rtokens_used_mask)) return i; } return -ENOSPC; } static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, u32 rkey) { int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (test_bit(i, lgr->rtokens_used_mask) && lgr->rtokens[i][lnk_idx].rkey == rkey) return i; } return -ENOENT; } /* set rtoken for a new link to an existing rmb */ void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) { int rtok_idx; rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); if (rtok_idx == -ENOENT) return; lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); } /* set rtoken for a new link whose link_id is given */ void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey) { u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); bool found = false; int link_idx; for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { if (lgr->lnk[link_idx].link_id == link_id) { found = true; break; } } if (!found) return; lgr->rtokens[rtok_idx][link_idx].rkey = rkey; lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; } /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; } } i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; lgr->rtokens[i][lnk->link_idx].rkey = rkey; lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } /* delete an rtoken from all links */ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { lgr->rtokens[i][j].rkey = 0; lgr->rtokens[i][j].dma_addr = 0; } clear_bit(i, lgr->rtokens_used_mask); return 0; } } return -ENOENT; } /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr, clc->r0.rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; return 0; } static void smc_core_going_away(void) { struct smc_ib_device *smcibdev; struct smcd_dev *smcd; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { int i; for (i = 0; i < SMC_MAX_PORTS; i++) set_bit(i, smcibdev->ports_going_away); } mutex_unlock(&smc_ib_devices.mutex); mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { smcd->going_away = 1; } mutex_unlock(&smcd_dev_list.mutex); } /* Clean up all SMC link groups */ static void smc_lgrs_shutdown(void) { struct smcd_dev *smcd; smc_core_going_away(); smc_smcr_terminate_all(NULL); mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) smc_smcd_terminate_all(smcd); mutex_unlock(&smcd_dev_list.mutex); } static int smc_core_reboot_event(struct notifier_block *this, unsigned long event, void *ptr) { smc_lgrs_shutdown(); smc_ib_unregister_client(); smc_ism_exit(); return 0; } static struct notifier_block smc_reboot_notifier = { .notifier_call = smc_core_reboot_event, }; int __init smc_core_init(void) { return register_reboot_notifier(&smc_reboot_notifier); } /* Called (from smc_exit) when module is removed */ void smc_core_exit(void) { unregister_reboot_notifier(&smc_reboot_notifier); smc_lgrs_shutdown(); }
31 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP protocol. * * Version: @(#)icmp.h 1.0.3 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_ICMP_H #define _LINUX_ICMP_H #include <linux/skbuff.h> #include <uapi/linux/icmp.h> #include <uapi/linux/errqueue.h> static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } static inline bool icmp_is_err(int type) { switch (type) { case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_REDIRECT: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: return true; } return false; } void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); #endif /* _LINUX_ICMP_H */
385 64 1967 30 583 275 584 584 373 2423 2431 1967 1746 72 581 584 583 584 2420 2419 2396 30 2423 2428 2430 526 308 222 472 527 233 233 234 234 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/file.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index". */ static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = TOMOYO_MAC_FILE_EXECUTE, [TOMOYO_TYPE_READ] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_WRITE] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_APPEND] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_UNLINK] = TOMOYO_MAC_FILE_UNLINK, [TOMOYO_TYPE_GETATTR] = TOMOYO_MAC_FILE_GETATTR, [TOMOYO_TYPE_RMDIR] = TOMOYO_MAC_FILE_RMDIR, [TOMOYO_TYPE_TRUNCATE] = TOMOYO_MAC_FILE_TRUNCATE, [TOMOYO_TYPE_SYMLINK] = TOMOYO_MAC_FILE_SYMLINK, [TOMOYO_TYPE_CHROOT] = TOMOYO_MAC_FILE_CHROOT, [TOMOYO_TYPE_UMOUNT] = TOMOYO_MAC_FILE_UMOUNT, }; /* * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = { [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK, [TOMOYO_TYPE_MKCHAR] = TOMOYO_MAC_FILE_MKCHAR, }; /* * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = { [TOMOYO_TYPE_LINK] = TOMOYO_MAC_FILE_LINK, [TOMOYO_TYPE_RENAME] = TOMOYO_MAC_FILE_RENAME, [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT, }; /* * Mapping table from "enum tomoyo_path_number_acl_index" to * "enum tomoyo_mac_index". */ const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = { [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE, [TOMOYO_TYPE_MKDIR] = TOMOYO_MAC_FILE_MKDIR, [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO, [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK, [TOMOYO_TYPE_IOCTL] = TOMOYO_MAC_FILE_IOCTL, [TOMOYO_TYPE_CHMOD] = TOMOYO_MAC_FILE_CHMOD, [TOMOYO_TYPE_CHOWN] = TOMOYO_MAC_FILE_CHOWN, [TOMOYO_TYPE_CHGRP] = TOMOYO_MAC_FILE_CHGRP, }; /** * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union". * * @ptr: Pointer to "struct tomoyo_name_union". * * Returns nothing. */ void tomoyo_put_name_union(struct tomoyo_name_union *ptr) { tomoyo_put_group(ptr->group); tomoyo_put_name(ptr->filename); } /** * tomoyo_compare_name_union - Check whether a name matches "struct tomoyo_name_union" or not. * * @name: Pointer to "struct tomoyo_path_info". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns "struct tomoyo_path_info" if @name matches @ptr, NULL otherwise. */ const struct tomoyo_path_info * tomoyo_compare_name_union(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr) { if (ptr->group) return tomoyo_path_matches_group(name, ptr->group); if (tomoyo_path_matches_pattern(name, ptr->filename)) return ptr->filename; return NULL; } /** * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union". * * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. */ void tomoyo_put_number_union(struct tomoyo_number_union *ptr) { tomoyo_put_group(ptr->group); } /** * tomoyo_compare_number_union - Check whether a value matches "struct tomoyo_number_union" or not. * * @value: Number to check. * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true if @value matches @ptr, false otherwise. */ bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr) { if (ptr->group) return tomoyo_number_matches_group(value, value, ptr->group); return value >= ptr->values[0] && value <= ptr->values[1]; } /** * tomoyo_add_slash - Add trailing '/' if needed. * * @buf: Pointer to "struct tomoyo_path_info". * * Returns nothing. * * @buf must be generated by tomoyo_encode() because this function does not * allocate memory for adding '/'. */ static void tomoyo_add_slash(struct tomoyo_path_info *buf) { if (buf->is_dir) return; /* * This is OK because tomoyo_encode() reserves space for appending "/". */ strcat((char *) buf->name, "/"); tomoyo_fill_path_info(buf); } /** * tomoyo_get_realpath - Get realpath. * * @buf: Pointer to "struct tomoyo_path_info". * @path: Pointer to "struct path". * * Returns true on success, false otherwise. */ static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path) { buf->name = tomoyo_realpath_from_path(path); if (buf->name) { tomoyo_fill_path_info(buf); return true; } return false; } /** * tomoyo_audit_path_log - Audit path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s\n", tomoyo_path_keyword [r->param.path.operation], r->param.path.filename->name); } /** * tomoyo_audit_path2_log - Audit path/path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path2_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pp2mac[r->param.path2.operation]], r->param.path2.filename1->name, r->param.path2.filename2->name); } /** * tomoyo_audit_mkdev_log - Audit path/number/number/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n", tomoyo_mac_keywords [tomoyo_pnnn2mac[r->param.mkdev.operation]], r->param.mkdev.filename->name, r->param.mkdev.mode, r->param.mkdev.major, r->param.mkdev.minor); } /** * tomoyo_audit_path_number_log - Audit path/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r) { const u8 type = r->param.path_number.operation; u8 radix; char buffer[64]; switch (type) { case TOMOYO_TYPE_CREATE: case TOMOYO_TYPE_MKDIR: case TOMOYO_TYPE_MKFIFO: case TOMOYO_TYPE_MKSOCK: case TOMOYO_TYPE_CHMOD: radix = TOMOYO_VALUE_TYPE_OCTAL; break; case TOMOYO_TYPE_IOCTL: radix = TOMOYO_VALUE_TYPE_HEXADECIMAL; break; default: radix = TOMOYO_VALUE_TYPE_DECIMAL; break; } tomoyo_print_ulong(buffer, sizeof(buffer), r->param.path_number.number, radix); return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pn2mac[type]], r->param.path_number.filename->name, buffer); } /** * tomoyo_check_path_acl - Check permission for path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. * * To be able to use wildcard for domain transition, this function sets * matching entry on success. Since the caller holds tomoyo_read_lock(), * it is safe to set matching entry. */ static bool tomoyo_check_path_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl), head); if (acl->perm & (1 << r->param.path.operation)) { r->param.path.matched_path = tomoyo_compare_name_union(r->param.path.filename, &acl->name); return r->param.path.matched_path != NULL; } return false; } /** * tomoyo_check_path_number_acl - Check permission for path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_number_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path_number.operation)) && tomoyo_compare_number_union(r->param.path_number.number, &acl->number) && tomoyo_compare_name_union(r->param.path_number.filename, &acl->name); } /** * tomoyo_check_path2_acl - Check permission for path path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path2_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path2.operation)) && tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1) && tomoyo_compare_name_union(r->param.path2.filename2, &acl->name2); } /** * tomoyo_check_mkdev_acl - Check permission for path number number number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mkdev_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.mkdev.operation)) && tomoyo_compare_number_union(r->param.mkdev.mode, &acl->mode) && tomoyo_compare_number_union(r->param.mkdev.major, &acl->major) && tomoyo_compare_number_union(r->param.mkdev.minor, &acl->minor) && tomoyo_compare_name_union(r->param.mkdev.filename, &acl->name); } /** * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head) ->perm; u16 perm = READ_ONCE(*a_perm); const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path_acl(const u16 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_acl e = { .head.type = TOMOYO_TYPE_PATH_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_acl, tomoyo_merge_path_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->mode, &p2->mode) && tomoyo_same_number_union(&p1->major, &p2->major) && tomoyo_same_number_union(&p1->minor, &p2->minor); } /** * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mkdev_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_mkdev_acl e = { .head.type = TOMOYO_TYPE_MKDEV_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.mode) || !tomoyo_parse_number_union(param, &e.major) || !tomoyo_parse_number_union(param, &e.minor)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mkdev_acl, tomoyo_merge_mkdev_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.mode); tomoyo_put_number_union(&e.major); tomoyo_put_number_union(&e.minor); return error; } /** * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name1, &p2->name1) && tomoyo_same_name_union(&p1->name2, &p2->name2); } /** * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head) ->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path2_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path2_acl e = { .head.type = TOMOYO_TYPE_PATH2_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name1) || !tomoyo_parse_name_union(param, &e.name2)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path2_acl, tomoyo_merge_path2_acl); tomoyo_put_name_union(&e.name1); tomoyo_put_name_union(&e.name2); return error; } /** * tomoyo_path_permission - Check permission for single path operation. * * @r: Pointer to "struct tomoyo_request_info". * @operation: Type of operation. * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation, const struct tomoyo_path_info *filename) { int error; r->type = tomoyo_p2mac[operation]; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); if (r->mode == TOMOYO_CONFIG_DISABLED) return 0; r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = operation; do { tomoyo_check_acl(r, tomoyo_check_path_acl); error = tomoyo_audit_path_log(r); } while (error == TOMOYO_RETRY_REQUEST); return error; } /** * tomoyo_execute_permission - Check permission for execute operation. * * @r: Pointer to "struct tomoyo_request_info". * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename) { /* * Unlike other permission checks, this check is done regardless of * profile mode settings in order to check for domain transition * preference. */ r->type = TOMOYO_MAC_FILE_EXECUTE; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = TOMOYO_TYPE_EXECUTE; tomoyo_check_acl(r, tomoyo_check_path_acl); r->ee->transition = r->matched_acl && r->matched_acl->cond ? r->matched_acl->cond->transit : NULL; if (r->mode != TOMOYO_CONFIG_DISABLED) return tomoyo_audit_path_log(r); return 0; } /** * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->number, &p2->number); } /** * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_update_path_number_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_number_acl e = { .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.number)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_number_acl, tomoyo_merge_path_number_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.number); return error; } /** * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp". * * @type: Type of operation. * @path: Pointer to "struct path". * @number: Number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_path_number_perm(const u8 type, const struct path *path, unsigned long number) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; if (type == TOMOYO_TYPE_MKDIR) tomoyo_add_slash(&buf); r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL; r.param.path_number.operation = type; r.param.path_number.filename = &buf; r.param.path_number.number = number; do { tomoyo_check_acl(&r, tomoyo_check_path_number_acl); error = tomoyo_audit_path_number_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf.name); out: tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_check_open_permission - Check permission for "read" and "write". * * @domain: Pointer to "struct tomoyo_domain_info". * @path: Pointer to "struct path". * @flag: Flags for open(). * * Returns 0 on success, negative value otherwise. */ int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag) { const u8 acc_mode = ACC_MODE(flag); int error = 0; struct tomoyo_path_info buf; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int idx; buf.name = NULL; r.mode = TOMOYO_CONFIG_DISABLED; idx = tomoyo_read_lock(); if (acc_mode && tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN) != TOMOYO_CONFIG_DISABLED) { if (!tomoyo_get_realpath(&buf, path)) { error = -ENOMEM; goto out; } r.obj = &obj; if (acc_mode & MAY_READ) error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ, &buf); if (!error && (acc_mode & MAY_WRITE)) error = tomoyo_path_permission(&r, (flag & O_APPEND) ? TOMOYO_TYPE_APPEND : TOMOYO_TYPE_WRITE, &buf); } out: kfree(buf.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount". * * @operation: Type of operation. * @path: Pointer to "struct path". * @target: Symlink's target if @operation is TOMOYO_TYPE_SYMLINK, * NULL otherwise. * * Returns 0 on success, negative value otherwise. */ int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error; struct tomoyo_path_info buf; bool is_enforce; struct tomoyo_path_info symlink_target; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; is_enforce = (r.mode == TOMOYO_CONFIG_ENFORCING); error = -ENOMEM; buf.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; switch (operation) { case TOMOYO_TYPE_RMDIR: case TOMOYO_TYPE_CHROOT: tomoyo_add_slash(&buf); break; case TOMOYO_TYPE_SYMLINK: symlink_target.name = tomoyo_encode(target); if (!symlink_target.name) goto out; tomoyo_fill_path_info(&symlink_target); obj.symlink_target = &symlink_target; break; } error = tomoyo_path_permission(&r, operation, &buf); if (operation == TOMOYO_TYPE_SYMLINK) kfree(symlink_target.name); out: kfree(buf.name); tomoyo_read_unlock(idx); if (!is_enforce) error = 0; return error; } /** * tomoyo_mkdev_perm - Check permission for "mkblock" and "mkchar". * * @operation: Type of operation. (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK) * @path: Pointer to "struct path". * @mode: Create mode. * @dev: Device number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); error = -ENOMEM; if (tomoyo_get_realpath(&buf, path)) { r.obj = &obj; dev = new_decode_dev(dev); r.param_type = TOMOYO_TYPE_MKDEV_ACL; r.param.mkdev.filename = &buf; r.param.mkdev.operation = operation; r.param.mkdev.mode = mode; r.param.mkdev.major = MAJOR(dev); r.param.mkdev.minor = MINOR(dev); tomoyo_check_acl(&r, tomoyo_check_mkdev_acl); error = tomoyo_audit_mkdev_log(&r); kfree(buf.name); } tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path2_perm - Check permission for "rename", "link" and "pivot_root". * * @operation: Type of operation. * @path1: Pointer to "struct path". * @path2: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2) { int error = -ENOMEM; struct tomoyo_path_info buf1; struct tomoyo_path_info buf2; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path1->mnt, .dentry = path1->dentry }, .path2 = { .mnt = path2->mnt, .dentry = path2->dentry } }; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; buf1.name = NULL; buf2.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf1, path1) || !tomoyo_get_realpath(&buf2, path2)) goto out; switch (operation) { case TOMOYO_TYPE_RENAME: case TOMOYO_TYPE_LINK: if (!d_is_dir(path1->dentry)) break; fallthrough; case TOMOYO_TYPE_PIVOT_ROOT: tomoyo_add_slash(&buf1); tomoyo_add_slash(&buf2); break; } r.obj = &obj; r.param_type = TOMOYO_TYPE_PATH2_ACL; r.param.path2.operation = operation; r.param.path2.filename1 = &buf1; r.param.path2.filename2 = &buf2; do { tomoyo_check_acl(&r, tomoyo_check_path2_acl); error = tomoyo_audit_path2_log(&r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(buf1.name); kfree(buf2.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_same_mount_acl - Check for duplicated "struct tomoyo_mount_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) && tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) && tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) && tomoyo_same_number_union(&p1->flags, &p2->flags); } /** * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param) { struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL }; int error; if (!tomoyo_parse_name_union(param, &e.dev_name) || !tomoyo_parse_name_union(param, &e.dir_name) || !tomoyo_parse_name_union(param, &e.fs_type) || !tomoyo_parse_number_union(param, &e.flags)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mount_acl, NULL); tomoyo_put_name_union(&e.dev_name); tomoyo_put_name_union(&e.dir_name); tomoyo_put_name_union(&e.fs_type); tomoyo_put_number_union(&e.flags); return error; } /** * tomoyo_write_file - Update file related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_file(struct tomoyo_acl_param *param) { u16 perm = 0; u8 type; const char *operation = tomoyo_read_token(param); for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_path_keyword[type])) perm |= 1 << type; if (perm) return tomoyo_update_path_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH2_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pp2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path2_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH_NUMBER_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path_number_acl(perm, param); for (type = 0; type < TOMOYO_MAX_MKDEV_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pnnn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_mkdev_acl(perm, param); if (tomoyo_permstr(operation, tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT])) return tomoyo_update_mount_acl(param); return -EINVAL; }
3 127 125 11 126 127 126 11 11 485 129 129 129 129 129 129 126 126 10 116 126 126 126 126 116 124 3 126 127 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Author: Mimi Zohar <zohar@us.ibm.com> * * File: ima_api.c * Implements must_appraise_or_measure, collect_measurement, * appraise_measurement, store_measurement and store_template. */ #include <linux/slab.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/evm.h> #include <linux/fsverity.h> #include "ima.h" /* * ima_free_template_entry - free an existing template entry */ void ima_free_template_entry(struct ima_template_entry *entry) { int i; for (i = 0; i < entry->template_desc->num_fields; i++) kfree(entry->template_data[i].data); kfree(entry->digests); kfree(entry); } /* * ima_alloc_init_template - create and initialize a new template entry */ int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *desc) { struct ima_template_desc *template_desc; struct tpm_digest *digests; int i, result = 0; if (desc) template_desc = desc; else template_desc = ima_template_desc_current(); *entry = kzalloc(struct_size(*entry, template_data, template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { kfree(*entry); *entry = NULL; return -ENOMEM; } (*entry)->digests = digests; (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { const struct ima_template_field *field = template_desc->fields[i]; u32 len; result = field->field_init(event_data, &((*entry)->template_data[i])); if (result != 0) goto out; len = (*entry)->template_data[i].len; (*entry)->template_data_len += sizeof(len); (*entry)->template_data_len += len; } return 0; out: ima_free_template_entry(*entry); *entry = NULL; return result; } /* * ima_store_template - store ima template measurements * * Calculate the hash of a template entry, add the template entry * to an ordered list of measurement entries maintained inside the kernel, * and also update the aggregate integrity value (maintained inside the * configured TPM PCR) over the hashes of the current list of measurement * entries. * * Applications retrieve the current kernel-held measurement list through * the securityfs entries in /sys/kernel/security/ima. The signed aggregate * TPM PCR (called quote) can be retrieved using a TPM user space library * and is used to validate the measurement list. * * Returns 0 on success, error code otherwise */ int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "hashing_error"; char *template_name = entry->template_desc->name; int result; if (!violation) { result = ima_calc_field_array_hash(&entry->template_data[0], entry); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, template_name, op, audit_cause, result, 0); return result; } } entry->pcr = pcr; result = ima_add_template_entry(entry, violation, op, inode, filename); return result; } /* * ima_add_violation - add violation to measurement list. * * Violations are flagged in the measurement list with zero hash values. * By extending the PCR with 0xFF's instead of with zeroes, the PCR * value is invalidated. */ void ima_add_violation(struct file *file, const unsigned char *filename, struct integrity_iint_cache *iint, const char *op, const char *cause) { struct ima_template_entry *entry; struct inode *inode = file_inode(file); struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .violation = cause }; int violation = 1; int result; /* can overflow, only indicator */ atomic_long_inc(&ima_htable.violations); result = ima_alloc_init_template(&event_data, &entry, NULL); if (result < 0) { result = -ENOMEM; goto err_out; } result = ima_store_template(entry, violation, inode, filename, CONFIG_IMA_MEASURE_PCR_IDX); if (result < 0) ima_free_template_entry(entry); err_out: integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, cause, result, 0); } /** * ima_get_action - appraise & measure decision based on policy. * @idmap: idmap of the mount the inode was found from * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate * @secid: secid of the task being validated * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC, * MAY_APPEND) * @func: caller identifier * @pcr: pointer filled in if matched measure policy sets pcr= * @template_desc: pointer filled in if matched measure policy sets template= * @func_data: func specific data, may be NULL * @allowed_algos: allowlist of hash algorithms for the IMA xattr * * The policy is defined in terms of keypairs: * subj=, obj=, type=, func=, mask=, fsmagic= * subj,obj, and type: are LSM specific. * func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK * | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA | SETXATTR_CHECK * | MMAP_CHECK_REQPROT * mask: contains the permission mask * fsmagic: hex value * * Returns IMA_MEASURE, IMA_APPRAISE mask. * */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH; flags &= ima_policy_flag; return ima_match_policy(idmap, inode, cred, secid, func, mask, flags, pcr, template_desc, func_data, allowed_algos); } static bool ima_get_verity_digest(struct integrity_iint_cache *iint, struct ima_max_digest_data *hash) { enum hash_algo alg; int digest_len; /* * On failure, 'measure' policy rules will result in a file data * hash containing 0's. */ digest_len = fsverity_get_digest(iint->inode, hash->digest, NULL, &alg); if (digest_len == 0) return false; /* * Unlike in the case of actually calculating the file hash, in * the fsverity case regardless of the hash algorithm, return * the verity digest to be included in the measurement list. A * mismatch between the verity algorithm and the xattr signature * algorithm, if one exists, will be detected later. */ hash->hdr.algo = alg; hash->hdr.length = digest_len; return true; } /* * ima_collect_measurement - collect file measurement * * Calculate the file hash, if it doesn't already exist, * storing the measurement and i_version in the iint. * * Must be called with iint->mutex held. * * Return 0 on success, error code otherwise */ int ima_collect_measurement(struct integrity_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig) { const char *audit_cause = "failed"; struct inode *inode = file_inode(file); struct inode *real_inode = d_real_inode(file_dentry(file)); const char *filename = file->f_path.dentry->d_name.name; struct ima_max_digest_data hash; struct kstat stat; int result = 0; int length; void *tmpbuf; u64 i_version = 0; /* * Always collect the modsig, because IMA might have already collected * the file digest without collecting the modsig in a previous * measurement rule. */ if (modsig) ima_collect_modsig(modsig, buf, size); if (iint->flags & IMA_COLLECTED) goto out; /* * Detecting file change is based on i_version. On filesystems * which do not support i_version, support was originally limited * to an initial measurement/appraisal/audit, but was modified to * assume the file changed. */ result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); if (!result && (stat.result_mask & STATX_CHANGE_COOKIE)) i_version = stat.change_cookie; hash.hdr.algo = algo; hash.hdr.length = hash_digest_size[algo]; /* Initialize hash digest to 0's in case of failure */ memset(&hash.digest, 0, sizeof(hash.digest)); if (iint->flags & IMA_VERITY_REQUIRED) { if (!ima_get_verity_digest(iint, &hash)) { audit_cause = "no-verity-digest"; result = -ENODATA; } } else if (buf) { result = ima_calc_buffer_hash(buf, size, &hash.hdr); } else { result = ima_calc_file_hash(file, &hash.hdr); } if (result && result != -EBADF && result != -EINVAL) goto out; length = sizeof(hash.hdr) + hash.hdr.length; tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS); if (!tmpbuf) { result = -ENOMEM; goto out; } iint->ima_hash = tmpbuf; memcpy(iint->ima_hash, &hash, length); iint->version = i_version; if (real_inode != inode) { iint->real_ino = real_inode->i_ino; iint->real_dev = real_inode->i_sb->s_dev; } /* Possibly temporary failure due to type of read (eg. O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; out: if (result) { if (file->f_flags & O_DIRECT) audit_cause = "failed(directio)"; integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename, "collect_data", audit_cause, result, 0); } return result; } /* * ima_store_measurement - store file measurement * * Create an "ima" template and then store the template by calling * ima_store_template. * * We only get here if the inode has not already been measured, * but the measurement could already exist: * - multiple copies of the same file on either the same or * different filesystems. * - the inode was previously flushed as well as the iint info, * containing the hashing info. * * Must be called with iint->mutex held. */ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "ENOMEM"; int result = -ENOMEM; struct inode *inode = file_inode(file); struct ima_template_entry *entry; struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .xattr_value = xattr_value, .xattr_len = xattr_len, .modsig = modsig }; int violation = 0; /* * We still need to store the measurement in the case of MODSIG because * we only have its contents to put in the list at the time of * appraisal, but a file measurement from earlier might already exist in * the measurement list. */ if (iint->measured_pcrs & (0x1 << pcr) && !modsig) return; result = ima_alloc_init_template(&event_data, &entry, template_desc); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, 0); return; } result = ima_store_template(entry, violation, inode, filename, pcr); if ((!result || result == -EEXIST) && !(file->f_flags & O_DIRECT)) { iint->flags |= IMA_MEASURED; iint->measured_pcrs |= (0x1 << pcr); } if (result < 0) ima_free_template_entry(entry); } void ima_audit_measurement(struct integrity_iint_cache *iint, const unsigned char *filename) { struct audit_buffer *ab; char *hash; const char *algo_name = hash_algo_name[iint->ima_hash->algo]; int i; if (iint->flags & IMA_AUDITED) return; hash = kzalloc((iint->ima_hash->length * 2) + 1, GFP_KERNEL); if (!hash) return; for (i = 0; i < iint->ima_hash->length; i++) hex_byte_pack(hash + (i * 2), iint->ima_hash->digest[i]); hash[i * 2] = '\0'; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_INTEGRITY_RULE); if (!ab) goto out; audit_log_format(ab, "file="); audit_log_untrustedstring(ab, filename); audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hash); audit_log_task_info(ab); audit_log_end(ab); iint->flags |= IMA_AUDITED; out: kfree(hash); return; } /* * ima_d_path - return a pointer to the full pathname * * Attempt to return a pointer to the full pathname for use in the * IMA measurement list, IMA audit records, and auditing logs. * * On failure, return a pointer to a copy of the filename, not dname. * Returning a pointer to dname, could result in using the pointer * after the memory has been freed. */ const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf) { char *pathname = NULL; *pathbuf = __getname(); if (*pathbuf) { pathname = d_absolute_path(path, *pathbuf, PATH_MAX); if (IS_ERR(pathname)) { __putname(*pathbuf); *pathbuf = NULL; pathname = NULL; } } if (!pathname) { strscpy(namebuf, path->dentry->d_name.name, NAME_MAX); pathname = namebuf; } return pathname; }
61 62 53 53 43 13 49 3 53 14 14 14 1 2 2 1 13 12 1 7 7 7 1 6 1 6 2 3 2 77 72 4 4 28 27 28 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 // SPDX-License-Identifier: GPL-2.0 /* * SUCS NET3: * * Generic stream handling routines. These are generic for most * protocols. Even IP. Tonight 8-). * This is used because TCP, LLC (others too) layer all have mostly * identical sendmsg() and recvmsg() code. * So we (will) share it here. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * (from old tcp.c code) * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-)) */ #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/net.h> #include <linux/signal.h> #include <linux/tcp.h> #include <linux/wait.h> #include <net/sock.h> /** * sk_stream_write_space - stream socket write_space callback. * @sk: socket * * FIXME: write proper description */ void sk_stream_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; struct socket_wq *wq; if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } /** * sk_stream_wait_connect - Wait for a socket to get into the connected state * @sk: sock to wait on * @timeo_p: for how long to wait * * Must be called with the socket locked. */ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct task_struct *tsk = current; int done; do { int err = sock_error(sk); if (err) return err; if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) return -EAGAIN; if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, !READ_ONCE(sk->sk_err) && !((1 << READ_ONCE(sk->sk_state)) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); return done < 0 ? done : 0; } EXPORT_SYMBOL(sk_stream_wait_connect); /** * sk_stream_closing - Return 1 if we still have things to send in our buffers. * @sk: socket to verify */ static int sk_stream_closing(const struct sock *sk) { return (1 << READ_ONCE(sk->sk_state)) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); } void sk_stream_wait_close(struct sock *sk, long timeout) { if (timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); do { if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait)) break; } while (!signal_pending(current) && timeout); remove_wait_queue(sk_sleep(sk), &wait); } } EXPORT_SYMBOL(sk_stream_wait_close); /** * sk_stream_wait_memory - Wait for more memory for a socket * @sk: socket to wait for memory * @timeo_p: for how long */ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) { int ret, err = 0; long vm_wait = 0; long current_timeo = *timeo_p; DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) current_timeo = vm_wait = get_random_u32_below(HZ / 5) + 2; add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; if (!*timeo_p) goto do_eagain; if (signal_pending(current)) goto do_interrupted; sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; ret = sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) || (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && !vm_wait), &wait); sk->sk_write_pending--; if (ret < 0) goto do_error; if (vm_wait) { vm_wait -= current_timeo; current_timeo = *timeo_p; if (current_timeo != MAX_SCHEDULE_TIMEOUT && (current_timeo -= vm_wait) < 0) current_timeo = 0; vm_wait = 0; } *timeo_p = current_timeo; } out: if (!sock_flag(sk, SOCK_DEAD)) remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: err = -EPIPE; goto out; do_eagain: /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can * be generated later. * When TCP receives ACK packets that make room, tcp_check_space() * only calls tcp_new_space() if SOCK_NOSPACE is set. */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; goto out; do_interrupted: err = sock_intr_errno(*timeo_p); goto out; } EXPORT_SYMBOL(sk_stream_wait_memory); int sk_stream_error(struct sock *sk, int flags, int err) { if (err == -EPIPE) err = sock_error(sk) ? : -EPIPE; if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); return err; } EXPORT_SYMBOL(sk_stream_error); void sk_stream_kill_queues(struct sock *sk) { /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); /* Next, the error queue. * We need to use queue lock, because other threads might * add packets to the queue without socket lock being held. */ skb_queue_purge(&sk->sk_error_queue); /* Next, the write queue. */ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); /* Account for returned memory. */ sk_mem_reclaim_final(sk); WARN_ON_ONCE(sk->sk_wmem_queued); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket * have gone away, only the net layer knows can touch it. */ } EXPORT_SYMBOL(sk_stream_kill_queues);
17 17 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 // SPDX-License-Identifier: GPL-2.0-only /* * File: sysctl.c * * Phonet /proc/sys/net/phonet interface implementation * * Copyright (C) 2008 Nokia Corporation. * * Author: Rémi Denis-Courmont */ #include <linux/seqlock.h> #include <linux/sysctl.h> #include <linux/errno.h> #include <linux/init.h> #include <net/sock.h> #include <linux/phonet.h> #include <net/phonet/phonet.h> #define DYNAMIC_PORT_MIN 0x40 #define DYNAMIC_PORT_MAX 0x7f static DEFINE_SEQLOCK(local_port_range_lock); static int local_port_range_min[2] = {0, 0}; static int local_port_range_max[2] = {1023, 1023}; static int local_port_range[2] = {DYNAMIC_PORT_MIN, DYNAMIC_PORT_MAX}; static struct ctl_table_header *phonet_table_hrd; static void set_local_port_range(int range[2]) { write_seqlock(&local_port_range_lock); local_port_range[0] = range[0]; local_port_range[1] = range[1]; write_sequnlock(&local_port_range_lock); } void phonet_get_local_port_range(int *min, int *max) { unsigned int seq; do { seq = read_seqbegin(&local_port_range_lock); if (min) *min = local_port_range[0]; if (max) *max = local_port_range[1]; } while (read_seqretry(&local_port_range_lock, seq)); } static int proc_local_port_range(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; struct ctl_table tmp = { .data = &range, .maxlen = sizeof(range), .mode = table->mode, .extra1 = &local_port_range_min, .extra2 = &local_port_range_max, }; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) ret = -EINVAL; else set_local_port_range(range); } return ret; } static struct ctl_table phonet_table[] = { { .procname = "local_port_range", .data = &local_port_range, .maxlen = sizeof(local_port_range), .mode = 0644, .proc_handler = proc_local_port_range, }, { } }; int __init phonet_sysctl_init(void) { phonet_table_hrd = register_net_sysctl(&init_net, "net/phonet", phonet_table); return phonet_table_hrd == NULL ? -ENOMEM : 0; } void phonet_sysctl_exit(void) { unregister_net_sysctl_table(phonet_table_hrd); }
117 116 115 27 10 79 79 2410 27 2406 4853 2530 4436 2926 4213 3165 1266 162 163 163 163 4787 4769 147 6188 5725 2900 6171 240 3234 3243 4787 2382 6173 3 1591 6183 6 670 6174 6 240 6 240 9 6178 4 696 6187 15 15 6206 1701 6179 6189 6 3 827 6371 6376 827 6367 6370 4811 4334 2 2 6366 2 6376 10 10 10 4 4 9 4 1 1 3 9 2 2 2 2 2 3 3 3 3 307 307 307 307 329 7 325 248 7 248 320 248 35 248 235 12 33 143 321 7 8 3 5 336 8 329 337 298 300 633 251 902 645 2 251 4 1 6 6942 667 6 6965 6678 9 637 240 2370 6840 666 6855 6 6869 3917 5903 2881 6354 902 62 276 1101 1352 3916 2 4057 667 6 6831 6852 4889 6295 667 6 2882 664 2886 2879 6347 890 893 62 1349 3920 2 2899 3544 3551 6588 3135 3198 5692 98 98 1957 276 277 2 23 175 9 3 1 4 17 37 240 116 104 116 15 105 98 9 99 7 96 8 8 96 95 96 79 11 3 79 8 90 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/vsprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */ /* * Wirzenius wrote this portably, Torvalds fucked it up :-) */ /* * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com> * - changed to provide snprintf and vsnprintf functions * So Feb 1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de> * - scnprintf and vscnprintf */ #include <linux/stdarg.h> #include <linux/build_bug.h> #include <linux/clk.h> #include <linux/clk-provider.h> #include <linux/errname.h> #include <linux/module.h> /* for KSYM_SYMBOL_LEN */ #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/kernel.h> #include <linux/kallsyms.h> #include <linux/math64.h> #include <linux/uaccess.h> #include <linux/ioport.h> #include <linux/dcache.h> #include <linux/cred.h> #include <linux/rtc.h> #include <linux/sprintf.h> #include <linux/time.h> #include <linux/uuid.h> #include <linux/of.h> #include <net/addrconf.h> #include <linux/siphash.h> #include <linux/compiler.h> #include <linux/property.h> #include <linux/notifier.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> #endif #include "../mm/internal.h" /* For the trace_print_flags arrays */ #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/byteorder.h> /* cpu_to_le16 */ #include <asm/unaligned.h> #include <linux/string_helpers.h> #include "kstrtox.h" /* Disable pointer hashing if requested */ bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { const char *cp; unsigned long long result = 0ULL; size_t prefix_chars; unsigned int rv; cp = _parse_integer_fixup_radix(startp, &base); prefix_chars = cp - startp; if (prefix_chars < max_chars) { rv = _parse_integer_limit(cp, base, &result, max_chars - prefix_chars); /* FIXME */ cp += (rv & ~KSTRTOX_OVERFLOW); } else { /* Field too short for prefix + digit, skip over without converting */ cp = startp + max_chars; } if (endp) *endp = (char *)cp; return result; } /** * simple_strtoull - convert a string to an unsigned long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoull instead. */ noinline unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { return simple_strntoull(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoull); /** * simple_strtoul - convert a string to an unsigned long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoul instead. */ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) { return simple_strtoull(cp, endp, base); } EXPORT_SYMBOL(simple_strtoul); /** * simple_strtol - convert a string to a signed long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtol instead. */ long simple_strtol(const char *cp, char **endp, unsigned int base) { if (*cp == '-') return -simple_strtoul(cp + 1, endp, base); return simple_strtoul(cp, endp, base); } EXPORT_SYMBOL(simple_strtol); noinline static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars) { /* * simple_strntoull() safely handles receiving max_chars==0 in the * case cp[0] == '-' && max_chars == 1. * If max_chars == 0 we can drop through and pass it to simple_strntoull() * and the content of *cp is irrelevant. */ if (*cp == '-' && max_chars > 0) return -simple_strntoull(cp + 1, endp, base, max_chars - 1); return simple_strntoull(cp, endp, base, max_chars); } /** * simple_strtoll - convert a string to a signed long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoll instead. */ long long simple_strtoll(const char *cp, char **endp, unsigned int base) { return simple_strntoll(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoll); static noinline_for_stack int skip_atoi(const char **s) { int i = 0; do { i = i*10 + *((*s)++) - '0'; } while (isdigit(**s)); return i; } /* * Decimal conversion is by far the most typical, and is used for * /proc and /sys data. This directly impacts e.g. top performance * with many processes running. We optimize it for speed by emitting * two characters at a time, using a 200 byte lookup table. This * roughly halves the number of multiplications compared to computing * the digits one at a time. Implementation strongly inspired by the * previous version, which in turn used ideas described at * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission * from the author, Douglas W. Jones). * * It turns out there is precisely one 26 bit fixed-point * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual * range happens to be somewhat larger (x <= 1073741898), but that's * irrelevant for our purpose. * * For dividing a number in the range [10^4, 10^6-1] by 100, we still * need a 32x32->64 bit multiply, so we simply use the same constant. * * For dividing a number in the range [100, 10^4-1] by 100, there are * several options. The simplest is (x * 0x147b) >> 19, which is valid * for all x <= 43698. */ static const u16 decpair[100] = { #define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), #undef _ }; /* * This will print a single '0' even if r == 0, since we would * immediately jump to out_r where two 0s would be written but only * one of them accounted for in buf. This is needed by ip4_string * below. All other callers pass a non-zero value of r. */ static noinline_for_stack char *put_dec_trunc8(char *buf, unsigned r) { unsigned q; /* 1 <= r < 10^8 */ if (r < 100) goto out_r; /* 100 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 1 <= q < 10^6 */ if (q < 100) goto out_q; /* 100 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 1 <= r < 10^4 */ if (r < 100) goto out_r; /* 100 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; out_q: /* 1 <= q < 100 */ r = q; out_r: /* 1 <= r < 100 */ *((u16 *)buf) = decpair[r]; buf += r < 10 ? 1 : 2; return buf; } #if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 static noinline_for_stack char *put_dec_full8(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; buf += 2; return buf; } static noinline_for_stack char *put_dec(char *buf, unsigned long long n) { if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n <= 1.6e11 */ if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n < 1e8 */ return put_dec_trunc8(buf, n); } #elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 static void put_dec_full4(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; } /* * Call put_dec_full4 on x % 10000, return x / 10000. * The approximation x/10000 == (x * 0x346DC5D7) >> 43 * holds for all x < 1,128,869,999. The largest value this * helper will ever be asked to convert is 1,125,520,955. * (second call in the put_dec code, assuming n is all-ones). */ static noinline_for_stack unsigned put_dec_helper4(char *buf, unsigned x) { uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; put_dec_full4(buf, x - q * 10000); return q; } /* Based on code by Douglas W. Jones found at * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> * (with permission from the author). * Performs no 64-bit division and hence should be fast on 32-bit machines. */ static char *put_dec(char *buf, unsigned long long n) { uint32_t d3, d2, d1, q, h; if (n < 100*1000*1000) return put_dec_trunc8(buf, n); d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ h = (n >> 32); d2 = (h ) & 0xffff; d3 = (h >> 16); /* implicit "& 0xffff" */ /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); q = put_dec_helper4(buf, q); q += 7671 * d3 + 9496 * d2 + 6 * d1; q = put_dec_helper4(buf+4, q); q += 4749 * d3 + 42 * d2; q = put_dec_helper4(buf+8, q); q += 281 * d3; buf += 12; if (q) buf = put_dec_trunc8(buf, q); else while (buf[-1] == '0') --buf; return buf; } #endif /* * Convert passed number to decimal string. * Returns the length of string. On buffer overflow, returns 0. * * If speed is not important, use snprintf(). It's easy to read the code. */ int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[sizeof(num) * 3] __aligned(2); int idx, len; /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ if (num <= 9) { tmp[0] = '0' + num; len = 1; } else { len = put_dec(tmp, num) - tmp; } if (len > size || width > size) return 0; if (width > len) { width = width - len; for (idx = 0; idx < width; idx++) buf[idx] = ' '; } else { width = 0; } for (idx = 0; idx < len; ++idx) buf[idx + width] = tmp[len - idx - 1]; return len + width; } #define SIGN 1 /* unsigned/signed, must be 1 */ #define LEFT 2 /* left justified */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define ZEROPAD 16 /* pad with zero, must be 16 == '0' - ' ' */ #define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */ #define SPECIAL 64 /* prefix hex with "0x", octal with "0" */ static_assert(SIGN == 1); static_assert(ZEROPAD == ('0' - ' ')); static_assert(SMALL == ('a' ^ 'A')); enum format_type { FORMAT_TYPE_NONE, /* Just a string part */ FORMAT_TYPE_WIDTH, FORMAT_TYPE_PRECISION, FORMAT_TYPE_CHAR, FORMAT_TYPE_STR, FORMAT_TYPE_PTR, FORMAT_TYPE_PERCENT_CHAR, FORMAT_TYPE_INVALID, FORMAT_TYPE_LONG_LONG, FORMAT_TYPE_ULONG, FORMAT_TYPE_LONG, FORMAT_TYPE_UBYTE, FORMAT_TYPE_BYTE, FORMAT_TYPE_USHORT, FORMAT_TYPE_SHORT, FORMAT_TYPE_UINT, FORMAT_TYPE_INT, FORMAT_TYPE_SIZE_T, FORMAT_TYPE_PTRDIFF }; struct printf_spec { unsigned int type:8; /* format_type enum */ signed int field_width:24; /* width of output field */ unsigned int flags:8; /* flags to number() */ unsigned int base:8; /* number base, 8, 10 or 16 only */ signed int precision:16; /* # of digits/chars */ } __packed; static_assert(sizeof(struct printf_spec) == 8); #define FIELD_WIDTH_MAX ((1 << 23) - 1) #define PRECISION_MAX ((1 << 15) - 1) static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[3 * sizeof(num)] __aligned(2); char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); int i; bool is_zero = num == 0LL; int field_width = spec.field_width; int precision = spec.precision; /* locase = 0 or 0x20. ORing digits or letters with 'locase' * produces same digits or (maybe lowercased) letters */ locase = (spec.flags & SMALL); if (spec.flags & LEFT) spec.flags &= ~ZEROPAD; sign = 0; if (spec.flags & SIGN) { if ((signed long long)num < 0) { sign = '-'; num = -(signed long long)num; field_width--; } else if (spec.flags & PLUS) { sign = '+'; field_width--; } else if (spec.flags & SPACE) { sign = ' '; field_width--; } } if (need_pfx) { if (spec.base == 16) field_width -= 2; else if (!is_zero) field_width--; } /* generate full string in tmp[], in reverse order */ i = 0; if (num < spec.base) tmp[i++] = hex_asc_upper[num] | locase; else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; if (spec.base == 16) shift = 4; do { tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); num >>= shift; } while (num); } else { /* base 10 */ i = put_dec(tmp, num) - tmp; } /* printing 100 using %2d gives "100", not "00" */ if (i > precision) precision = i; /* leading space padding */ field_width -= precision; if (!(spec.flags & (ZEROPAD | LEFT))) { while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } } /* sign */ if (sign) { if (buf < end) *buf = sign; ++buf; } /* "0x" / "0" prefix */ if (need_pfx) { if (spec.base == 16 || !is_zero) { if (buf < end) *buf = '0'; ++buf; } if (spec.base == 16) { if (buf < end) *buf = ('X' | locase); ++buf; } } /* zero or space padding */ if (!(spec.flags & LEFT)) { char c = ' ' + (spec.flags & ZEROPAD); while (--field_width >= 0) { if (buf < end) *buf = c; ++buf; } } /* hmm even more zero padding? */ while (i <= --precision) { if (buf < end) *buf = '0'; ++buf; } /* actual digits of result */ while (--i >= 0) { if (buf < end) *buf = tmp[i]; ++buf; } /* trailing space padding */ while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } return buf; } static noinline_for_stack char *special_hex_number(char *buf, char *end, unsigned long long num, int size) { struct printf_spec spec; spec.type = FORMAT_TYPE_PTR; spec.field_width = 2 + 2 * size; /* 0x + hex */ spec.flags = SPECIAL | SMALL | ZEROPAD; spec.base = 16; spec.precision = -1; return number(buf, end, num, spec); } static void move_right(char *buf, char *end, unsigned len, unsigned spaces) { size_t size; if (buf >= end) /* nowhere to put anything */ return; size = end - buf; if (size <= spaces) { memset(buf, ' ', size); return; } if (len) { if (len > size - spaces) len = size - spaces; memmove(buf + spaces, buf, len); } memset(buf, ' ', spaces); } /* * Handle field width padding for a string. * @buf: current buffer position * @n: length of string * @end: end of output buffer * @spec: for field width and flags * Returns: new buffer position after padding. */ static noinline_for_stack char *widen_string(char *buf, int n, char *end, struct printf_spec spec) { unsigned spaces; if (likely(n >= spec.field_width)) return buf; /* we want to pad the sucker */ spaces = spec.field_width - n; if (!(spec.flags & LEFT)) { move_right(buf - n, end, n, spaces); return buf + spaces; } while (spaces--) { if (buf < end) *buf = ' '; ++buf; } return buf; } /* Handle string from a well known address. */ static char *string_nocheck(char *buf, char *end, const char *s, struct printf_spec spec) { int len = 0; int lim = spec.precision; while (lim--) { char c = *s++; if (!c) break; if (buf < end) *buf = c; ++buf; ++len; } return widen_string(buf, len, end, spec); } static char *err_ptr(char *buf, char *end, void *ptr, struct printf_spec spec) { int err = PTR_ERR(ptr); const char *sym = errname(err); if (sym) return string_nocheck(buf, end, sym, spec); /* * Somebody passed ERR_PTR(-1234) or some other non-existing * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to * printing it as its decimal representation. */ spec.flags |= SIGN; spec.base = 10; return number(buf, end, err, spec); } /* Be careful: error messages must fit into the given buffer. */ static char *error_string(char *buf, char *end, const char *s, struct printf_spec spec) { /* * Hard limit to avoid a completely insane messages. It actually * works pretty well because most error messages are in * the many pointer format modifiers. */ if (spec.precision == -1) spec.precision = 2 * sizeof(void *); return string_nocheck(buf, end, s, spec); } /* * Do not call any complex external code here. Nested printk()/vsprintf() * might cause infinite loops. Failures might break printk() and would * be hard to debug. */ static const char *check_pointer_msg(const void *ptr) { if (!ptr) return "(null)"; if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr)) return "(efault)"; return NULL; } static int check_pointer(char **buf, char *end, const void *ptr, struct printf_spec spec) { const char *err_msg; err_msg = check_pointer_msg(ptr); if (err_msg) { *buf = error_string(*buf, end, err_msg, spec); return -EFAULT; } return 0; } static noinline_for_stack char *string(char *buf, char *end, const char *s, struct printf_spec spec) { if (check_pointer(&buf, end, s, spec)) return buf; return string_nocheck(buf, end, s, spec); } static char *pointer_string(char *buf, char *end, const void *ptr, struct printf_spec spec) { spec.base = 16; spec.flags |= SMALL; if (spec.field_width == -1) { spec.field_width = 2 * sizeof(ptr); spec.flags |= ZEROPAD; } return number(buf, end, (unsigned long int)ptr, spec); } /* Make pointers available for printing early in the boot sequence. */ static int debug_boot_weak_hash __ro_after_init; static int __init debug_boot_weak_hash_enable(char *str) { debug_boot_weak_hash = 1; pr_info("debug_boot_weak_hash enabled\n"); return 0; } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); static bool filled_random_ptr_key __read_mostly; static siphash_key_t ptr_key __read_mostly; static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data) { get_random_bytes(&ptr_key, sizeof(ptr_key)); /* Pairs with smp_rmb() before reading ptr_key. */ smp_wmb(); WRITE_ONCE(filled_random_ptr_key, true); return NOTIFY_DONE; } static int __init vsprintf_init_hashval(void) { static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key }; execute_with_initialized_rng(&fill_ptr_key_nb); return 0; } subsys_initcall(vsprintf_init_hashval) /* Maps a pointer to a 32 bit unique identifier. */ static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { unsigned long hashval; if (!READ_ONCE(filled_random_ptr_key)) return -EBUSY; /* Pairs with smp_wmb() after writing ptr_key. */ smp_rmb(); #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); /* * Mask off the first 32 bits, this makes explicit that we have * modified the address (and 32 bits is plenty for a unique ID). */ hashval = hashval & 0xffffffff; #else hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); #endif *hashval_out = hashval; return 0; } int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { return __ptr_to_hashval(ptr, hashval_out); } static char *ptr_to_id(char *buf, char *end, const void *ptr, struct printf_spec spec) { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; int ret; /* * Print the real pointer value for NULL and error pointers, * as they are not actual addresses. */ if (IS_ERR_OR_NULL(ptr)) return pointer_string(buf, end, ptr, spec); /* When debugging early boot use non-cryptographically secure hash. */ if (unlikely(debug_boot_weak_hash)) { hashval = hash_long((unsigned long)ptr, 32); return pointer_string(buf, end, (const void *)hashval, spec); } ret = __ptr_to_hashval(ptr, &hashval); if (ret) { spec.field_width = 2 * sizeof(ptr); /* string length must be less than default_width */ return error_string(buf, end, str, spec); } return pointer_string(buf, end, (const void *)hashval, spec); } static char *default_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { /* * default is to _not_ leak addresses, so hash before printing, * unless no_hash_pointers is specified on the command line. */ if (unlikely(no_hash_pointers)) return pointer_string(buf, end, ptr, spec); return ptr_to_id(buf, end, ptr, spec); } int kptr_restrict __read_mostly; static noinline_for_stack char *restricted_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { switch (kptr_restrict) { case 0: /* Handle as %p, hash and do _not_ leak addresses. */ return default_pointer(buf, end, ptr, spec); case 1: { const struct cred *cred; /* * kptr_restrict==1 cannot be used in IRQ context * because its test for CAP_SYSLOG would be meaningless. */ if (in_hardirq() || in_serving_softirq() || in_nmi()) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(ptr); return error_string(buf, end, "pK-error", spec); } /* * Only print the real pointer value if the current * process has CAP_SYSLOG and is running with the * same credentials it started with. This is because * access to files is checked at open() time, but %pK * checks permission at read() time. We don't want to * leak pointer values if a binary opens a file using * %pK and then elevates privileges before reading it. */ cred = current_cred(); if (!has_capability_noaudit(current, CAP_SYSLOG) || !uid_eq(cred->euid, cred->uid) || !gid_eq(cred->egid, cred->gid)) ptr = NULL; break; } case 2: default: /* Always print 0's for %pK */ ptr = NULL; break; } return pointer_string(buf, end, ptr, spec); } static noinline_for_stack char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, const char *fmt) { const char *array[4], *s; const struct dentry *p; int depth; int i, n; switch (fmt[1]) { case '2': case '3': case '4': depth = fmt[1] - '0'; break; default: depth = 1; } rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { if (check_pointer(&buf, end, d, spec)) { rcu_read_unlock(); return buf; } p = READ_ONCE(d->d_parent); array[i] = READ_ONCE(d->d_name.name); if (p == d) { if (i) array[i] = ""; i++; break; } } s = array[--i]; for (n = 0; n != spec.precision; n++, buf++) { char c = *s++; if (!c) { if (!i) break; c = '/'; s = array[--i]; } if (buf < end) *buf = c; } rcu_read_unlock(); return widen_string(buf, n, end, spec); } static noinline_for_stack char *file_dentry_name(char *buf, char *end, const struct file *f, struct printf_spec spec, const char *fmt) { if (check_pointer(&buf, end, f, spec)) return buf; return dentry_name(buf, end, f->f_path.dentry, spec, fmt); } #ifdef CONFIG_BLOCK static noinline_for_stack char *bdev_name(char *buf, char *end, struct block_device *bdev, struct printf_spec spec, const char *fmt) { struct gendisk *hd; if (check_pointer(&buf, end, bdev, spec)) return buf; hd = bdev->bd_disk; buf = string(buf, end, hd->disk_name, spec); if (bdev->bd_partno) { if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) { if (buf < end) *buf = 'p'; buf++; } buf = number(buf, end, bdev->bd_partno, spec); } return buf; } #endif static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; #endif if (fmt[1] == 'R') ptr = __builtin_extract_return_addr(ptr); value = (unsigned long)ptr; #ifdef CONFIG_KALLSYMS if (*fmt == 'B' && fmt[1] == 'b') sprint_backtrace_build_id(sym, value); else if (*fmt == 'B') sprint_backtrace(sym, value); else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b'))) sprint_symbol_build_id(sym, value); else if (*fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); return string_nocheck(buf, end, sym, spec); #else return special_hex_number(buf, end, value, sizeof(void *)); #endif } static const struct printf_spec default_str_spec = { .field_width = -1, .precision = -1, }; static const struct printf_spec default_flag_spec = { .base = 16, .precision = -1, .flags = SPECIAL | SMALL, }; static const struct printf_spec default_dec_spec = { .base = 10, .precision = -1, }; static const struct printf_spec default_dec02_spec = { .base = 10, .field_width = 2, .precision = -1, .flags = ZEROPAD, }; static const struct printf_spec default_dec04_spec = { .base = 10, .field_width = 4, .precision = -1, .flags = ZEROPAD, }; static noinline_for_stack char *resource_string(char *buf, char *end, struct resource *res, struct printf_spec spec, const char *fmt) { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 6 #endif #ifndef MEM_RSRC_PRINTK_SIZE #define MEM_RSRC_PRINTK_SIZE 10 #endif static const struct printf_spec io_spec = { .base = 16, .field_width = IO_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec mem_spec = { .base = 16, .field_width = MEM_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec bus_spec = { .base = 16, .field_width = 2, .precision = -1, .flags = SMALL | ZEROPAD, }; static const struct printf_spec str_spec = { .field_width = -1, .precision = 10, .flags = LEFT, }; /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8) * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */ #define RSRC_BUF_SIZE ((2 * sizeof(resource_size_t)) + 4) #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; char *p = sym, *pend = sym + sizeof(sym); int decode = (fmt[0] == 'R') ? 1 : 0; const struct printf_spec *specp; if (check_pointer(&buf, end, res, spec)) return buf; *p++ = '['; if (res->flags & IORESOURCE_IO) { p = string_nocheck(p, pend, "io ", str_spec); specp = &io_spec; } else if (res->flags & IORESOURCE_MEM) { p = string_nocheck(p, pend, "mem ", str_spec); specp = &mem_spec; } else if (res->flags & IORESOURCE_IRQ) { p = string_nocheck(p, pend, "irq ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_DMA) { p = string_nocheck(p, pend, "dma ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_BUS) { p = string_nocheck(p, pend, "bus ", str_spec); specp = &bus_spec; } else { p = string_nocheck(p, pend, "??? ", str_spec); specp = &mem_spec; decode = 0; } if (decode && res->flags & IORESOURCE_UNSET) { p = string_nocheck(p, pend, "size ", str_spec); p = number(p, pend, resource_size(res), *specp); } else { p = number(p, pend, res->start, *specp); if (res->start != res->end) { *p++ = '-'; p = number(p, pend, res->end, *specp); } } if (decode) { if (res->flags & IORESOURCE_MEM_64) p = string_nocheck(p, pend, " 64bit", str_spec); if (res->flags & IORESOURCE_PREFETCH) p = string_nocheck(p, pend, " pref", str_spec); if (res->flags & IORESOURCE_WINDOW) p = string_nocheck(p, pend, " window", str_spec); if (res->flags & IORESOURCE_DISABLED) p = string_nocheck(p, pend, " disabled", str_spec); } else { p = string_nocheck(p, pend, " flags ", str_spec); p = number(p, pend, res->flags, default_flag_spec); } *p++ = ']'; *p = '\0'; return string_nocheck(buf, end, sym, spec); } static noinline_for_stack char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { int i, len = 1; /* if we pass '%ph[CDN]', field width remains negative value, fallback to the default */ char separator; if (spec.field_width == 0) /* nothing to print */ return buf; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'C': separator = ':'; break; case 'D': separator = '-'; break; case 'N': separator = 0; break; default: separator = ' '; break; } if (spec.field_width > 0) len = min_t(int, spec.field_width, 64); for (i = 0; i < len; ++i) { if (buf < end) *buf = hex_asc_hi(addr[i]); ++buf; if (buf < end) *buf = hex_asc_lo(addr[i]); ++buf; if (separator && i != len - 1) { if (buf < end) *buf = separator; ++buf; } } return buf; } static noinline_for_stack char *bitmap_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { const int CHUNKSZ = 32; int nr_bits = max_t(int, spec.field_width, 0); int i, chunksz; bool first = true; if (check_pointer(&buf, end, bitmap, spec)) return buf; /* reused to print numbers */ spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 }; chunksz = nr_bits & (CHUNKSZ - 1); if (chunksz == 0) chunksz = CHUNKSZ; i = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ; for (; i >= 0; i -= CHUNKSZ) { u32 chunkmask, val; int word, bit; chunkmask = ((1ULL << chunksz) - 1); word = i / BITS_PER_LONG; bit = i % BITS_PER_LONG; val = (bitmap[word] >> bit) & chunkmask; if (!first) { if (buf < end) *buf = ','; buf++; } first = false; spec.field_width = DIV_ROUND_UP(chunksz, 4); buf = number(buf, end, val, spec); chunksz = CHUNKSZ; } return buf; } static noinline_for_stack char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { int nr_bits = max_t(int, spec.field_width, 0); bool first = true; int rbot, rtop; if (check_pointer(&buf, end, bitmap, spec)) return buf; for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { if (!first) { if (buf < end) *buf = ','; buf++; } first = false; buf = number(buf, end, rbot, default_dec_spec); if (rtop == rbot + 1) continue; if (buf < end) *buf = '-'; buf = number(++buf, end, rtop - 1, default_dec_spec); } return buf; } static noinline_for_stack char *mac_address_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")]; char *p = mac_addr; int i; char separator; bool reversed = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': separator = '-'; break; case 'R': reversed = true; fallthrough; default: separator = ':'; break; } for (i = 0; i < 6; i++) { if (reversed) p = hex_byte_pack(p, addr[5 - i]); else p = hex_byte_pack(p, addr[i]); if (fmt[0] == 'M' && i != 5) *p++ = separator; } *p = '\0'; return string_nocheck(buf, end, mac_addr, spec); } static noinline_for_stack char *ip4_string(char *p, const u8 *addr, const char *fmt) { int i; bool leading_zeros = (fmt[0] == 'i'); int index; int step; switch (fmt[2]) { case 'h': #ifdef __BIG_ENDIAN index = 0; step = 1; #else index = 3; step = -1; #endif break; case 'l': index = 3; step = -1; break; case 'n': case 'b': default: index = 0; step = 1; break; } for (i = 0; i < 4; i++) { char temp[4] __aligned(2); /* hold each IP quad in reverse order */ int digits = put_dec_trunc8(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) *p++ = '0'; if (digits < 2) *p++ = '0'; } /* reverse the digits in the quad */ while (digits--) *p++ = temp[digits]; if (i < 3) *p++ = '.'; index += step; } *p = '\0'; return p; } static noinline_for_stack char *ip6_compressed_string(char *p, const char *addr) { int i, j, range; unsigned char zerolength[8]; int longest = 1; int colonpos = -1; u16 word; u8 hi, lo; bool needcolon = false; bool useIPv4; struct in6_addr in6; memcpy(&in6, addr, sizeof(struct in6_addr)); useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6); memset(zerolength, 0, sizeof(zerolength)); if (useIPv4) range = 6; else range = 8; /* find position of longest 0 run */ for (i = 0; i < range; i++) { for (j = i; j < range; j++) { if (in6.s6_addr16[j] != 0) break; zerolength[i]++; } } for (i = 0; i < range; i++) { if (zerolength[i] > longest) { longest = zerolength[i]; colonpos = i; } } if (longest == 1) /* don't compress a single 0 */ colonpos = -1; /* emit address */ for (i = 0; i < range; i++) { if (i == colonpos) { if (needcolon || i == 0) *p++ = ':'; *p++ = ':'; needcolon = false; i += longest - 1; continue; } if (needcolon) { *p++ = ':'; needcolon = false; } /* hex u16 without leading 0s */ word = ntohs(in6.s6_addr16[i]); hi = word >> 8; lo = word & 0xff; if (hi) { if (hi > 0x0f) p = hex_byte_pack(p, hi); else *p++ = hex_asc_lo(hi); p = hex_byte_pack(p, lo); } else if (lo > 0x0f) p = hex_byte_pack(p, lo); else *p++ = hex_asc_lo(lo); needcolon = true; } if (useIPv4) { if (needcolon) *p++ = ':'; p = ip4_string(p, &in6.s6_addr[12], "I4"); } *p = '\0'; return p; } static noinline_for_stack char *ip6_string(char *p, const char *addr, const char *fmt) { int i; for (i = 0; i < 8; i++) { p = hex_byte_pack(p, *addr++); p = hex_byte_pack(p, *addr++); if (fmt[0] == 'I' && i != 7) *p++ = ':'; } *p = '\0'; return p; } static noinline_for_stack char *ip6_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; if (fmt[0] == 'I' && fmt[2] == 'c') ip6_compressed_string(ip6_addr, addr); else ip6_string(ip6_addr, addr, fmt); return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip4_addr[sizeof("255.255.255.255")]; ip4_string(ip4_addr, addr, fmt); return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, struct printf_spec spec, const char *fmt) { bool have_p = false, have_s = false, have_f = false, have_c = false; char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + sizeof(":12345") + sizeof("/123456789") + sizeof("%1234567890")]; char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr); const u8 *addr = (const u8 *) &sa->sin6_addr; char fmt6[2] = { fmt[0], '6' }; u8 off = 0; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'f': have_f = true; break; case 's': have_s = true; break; case 'c': have_c = true; break; } } if (have_p || have_s || have_f) { *p = '['; off = 1; } if (fmt6[0] == 'I' && have_c) p = ip6_compressed_string(ip6_addr + off, addr); else p = ip6_string(ip6_addr + off, addr, fmt6); if (have_p || have_s || have_f) *p++ = ']'; if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin6_port), spec); } if (have_f) { *p++ = '/'; p = number(p, pend, ntohl(sa->sin6_flowinfo & IPV6_FLOWINFO_MASK), spec); } if (have_s) { *p++ = '%'; p = number(p, pend, sa->sin6_scope_id, spec); } *p = '\0'; return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, struct printf_spec spec, const char *fmt) { bool have_p = false; char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")]; char *pend = ip4_addr + sizeof(ip4_addr); const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; char fmt4[3] = { fmt[0], '4', 0 }; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'h': case 'l': case 'n': case 'b': fmt4[2] = *fmt; break; } } p = ip4_string(ip4_addr, addr, fmt4); if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin_port), spec); } *p = '\0'; return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip_addr_string(char *buf, char *end, const void *ptr, struct printf_spec spec, const char *fmt) { char *err_fmt_msg; if (check_pointer(&buf, end, ptr, spec)) return buf; switch (fmt[1]) { case '6': return ip6_addr_string(buf, end, ptr, spec, fmt); case '4': return ip4_addr_string(buf, end, ptr, spec, fmt); case 'S': { const union { struct sockaddr raw; struct sockaddr_in v4; struct sockaddr_in6 v6; } *sa = ptr; switch (sa->raw.sa_family) { case AF_INET: return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt); case AF_INET6: return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt); default: return error_string(buf, end, "(einval)", spec); }} } err_fmt_msg = fmt[0] == 'i' ? "(%pi?)" : "(%pI?)"; return error_string(buf, end, err_fmt_msg, spec); } static noinline_for_stack char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { bool found = true; int count = 1; unsigned int flags = 0; int len; if (spec.field_width == 0) return buf; /* nothing to print */ if (check_pointer(&buf, end, addr, spec)) return buf; do { switch (fmt[count++]) { case 'a': flags |= ESCAPE_ANY; break; case 'c': flags |= ESCAPE_SPECIAL; break; case 'h': flags |= ESCAPE_HEX; break; case 'n': flags |= ESCAPE_NULL; break; case 'o': flags |= ESCAPE_OCTAL; break; case 'p': flags |= ESCAPE_NP; break; case 's': flags |= ESCAPE_SPACE; break; default: found = false; break; } } while (found); if (!flags) flags = ESCAPE_ANY_NP; len = spec.field_width < 0 ? 1 : spec.field_width; /* * string_escape_mem() writes as many characters as it can to * the given buffer, and returns the total size of the output * had the buffer been big enough. */ buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); return buf; } static char *va_format(char *buf, char *end, struct va_format *va_fmt, struct printf_spec spec, const char *fmt) { va_list va; if (check_pointer(&buf, end, va_fmt, spec)) return buf; va_copy(va, *va_fmt->va); buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va); va_end(va); return buf; } static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; const u8 *index = uuid_index; bool uc = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (*(++fmt)) { case 'L': uc = true; fallthrough; case 'l': index = guid_index; break; case 'B': uc = true; break; } for (i = 0; i < 16; i++) { if (uc) p = hex_byte_pack_upper(p, addr[index[i]]); else p = hex_byte_pack(p, addr[index[i]]); switch (i) { case 3: case 5: case 7: case 9: *p++ = '-'; break; } } *p = 0; return string_nocheck(buf, end, uuid, spec); } static noinline_for_stack char *netdev_bits(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': num = *(const netdev_features_t *)addr; size = sizeof(netdev_features_t); break; default: return error_string(buf, end, "(%pN?)", spec); } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *fourcc_string(char *buf, char *end, const u32 *fourcc, struct printf_spec spec, const char *fmt) { char output[sizeof("0123 little-endian (0x01234567)")]; char *p = output; unsigned int i; u32 orig, val; if (fmt[1] != 'c' || fmt[2] != 'c') return error_string(buf, end, "(%p4?)", spec); if (check_pointer(&buf, end, fourcc, spec)) return buf; orig = get_unaligned(fourcc); val = orig & ~BIT(31); for (i = 0; i < sizeof(u32); i++) { unsigned char c = val >> (i * 8); /* Print non-control ASCII characters as-is, dot otherwise */ *p++ = isascii(c) && isprint(c) ? c : '.'; } *p++ = ' '; strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian"); p += strlen(p); *p++ = ' '; *p++ = '('; p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32)); *p++ = ')'; *p = '\0'; return string(buf, end, output, spec); } static noinline_for_stack char *address_val(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'd': num = *(const dma_addr_t *)addr; size = sizeof(dma_addr_t); break; case 'p': default: num = *(const phys_addr_t *)addr; size = sizeof(phys_addr_t); break; } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r) { int year = tm->tm_year + (r ? 0 : 1900); int mon = tm->tm_mon + (r ? 0 : 1); buf = number(buf, end, year, default_dec04_spec); if (buf < end) *buf = '-'; buf++; buf = number(buf, end, mon, default_dec02_spec); if (buf < end) *buf = '-'; buf++; return number(buf, end, tm->tm_mday, default_dec02_spec); } static noinline_for_stack char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r) { buf = number(buf, end, tm->tm_hour, default_dec02_spec); if (buf < end) *buf = ':'; buf++; buf = number(buf, end, tm->tm_min, default_dec02_spec); if (buf < end) *buf = ':'; buf++; return number(buf, end, tm->tm_sec, default_dec02_spec); } static noinline_for_stack char *rtc_str(char *buf, char *end, const struct rtc_time *tm, struct printf_spec spec, const char *fmt) { bool have_t = true, have_d = true; bool raw = false, iso8601_separator = true; bool found = true; int count = 2; if (check_pointer(&buf, end, tm, spec)) return buf; switch (fmt[count]) { case 'd': have_t = false; count++; break; case 't': have_d = false; count++; break; } do { switch (fmt[count++]) { case 'r': raw = true; break; case 's': iso8601_separator = false; break; default: found = false; break; } } while (found); if (have_d) buf = date_str(buf, end, tm, raw); if (have_d && have_t) { if (buf < end) *buf = iso8601_separator ? 'T' : ' '; buf++; } if (have_t) buf = time_str(buf, end, tm, raw); return buf; } static noinline_for_stack char *time64_str(char *buf, char *end, const time64_t time, struct printf_spec spec, const char *fmt) { struct rtc_time rtc_time; struct tm tm; time64_to_tm(time, 0, &tm); rtc_time.tm_sec = tm.tm_sec; rtc_time.tm_min = tm.tm_min; rtc_time.tm_hour = tm.tm_hour; rtc_time.tm_mday = tm.tm_mday; rtc_time.tm_mon = tm.tm_mon; rtc_time.tm_year = tm.tm_year; rtc_time.tm_wday = tm.tm_wday; rtc_time.tm_yday = tm.tm_yday; rtc_time.tm_isdst = 0; return rtc_str(buf, end, &rtc_time, spec, fmt); } static noinline_for_stack char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { switch (fmt[1]) { case 'R': return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt); case 'T': return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt); default: return error_string(buf, end, "(%pt?)", spec); } } static noinline_for_stack char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, const char *fmt) { if (!IS_ENABLED(CONFIG_HAVE_CLK)) return error_string(buf, end, "(%pC?)", spec); if (check_pointer(&buf, end, clk, spec)) return buf; switch (fmt[1]) { case 'n': default: #ifdef CONFIG_COMMON_CLK return string(buf, end, __clk_get_name(clk), spec); #else return ptr_to_id(buf, end, clk, spec); #endif } } static char *format_flags(char *buf, char *end, unsigned long flags, const struct trace_print_flags *names) { unsigned long mask; for ( ; flags && names->name; names++) { mask = names->mask; if ((flags & mask) != mask) continue; buf = string(buf, end, names->name, default_str_spec); flags &= ~mask; if (flags) { if (buf < end) *buf = '|'; buf++; } } if (flags) buf = number(buf, end, flags, default_flag_spec); return buf; } struct page_flags_fields { int width; int shift; int mask; const struct printf_spec *spec; const char *name; }; static const struct page_flags_fields pff[] = { {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, &default_dec_spec, "section"}, {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, &default_dec_spec, "node"}, {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, &default_dec_spec, "zone"}, {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, &default_flag_spec, "lastcpupid"}, {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, &default_flag_spec, "kasantag"}, }; static char *format_page_flags(char *buf, char *end, unsigned long flags) { unsigned long main_flags = flags & PAGEFLAGS_MASK; bool append = false; int i; buf = number(buf, end, flags, default_flag_spec); if (buf < end) *buf = '('; buf++; /* Page flags from the main area. */ if (main_flags) { buf = format_flags(buf, end, main_flags, pageflag_names); append = true; } /* Page flags from the fields area */ for (i = 0; i < ARRAY_SIZE(pff); i++) { /* Skip undefined fields. */ if (!pff[i].width) continue; /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ if (append) { if (buf < end) *buf = '|'; buf++; } buf = string(buf, end, pff[i].name, default_str_spec); if (buf < end) *buf = '='; buf++; buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, *pff[i].spec); append = true; } if (buf < end) *buf = ')'; buf++; return buf; } static char *format_page_type(char *buf, char *end, unsigned int page_type) { buf = number(buf, end, page_type, default_flag_spec); if (buf < end) *buf = '('; buf++; if (page_type_has_type(page_type)) buf = format_flags(buf, end, ~page_type, pagetype_names); if (buf < end) *buf = ')'; buf++; return buf; } static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) { unsigned long flags; const struct trace_print_flags *names; if (check_pointer(&buf, end, flags_ptr, spec)) return buf; switch (fmt[1]) { case 'p': return format_page_flags(buf, end, *(unsigned long *)flags_ptr); case 't': return format_page_type(buf, end, *(unsigned int *)flags_ptr); case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; break; case 'g': flags = (__force unsigned long)(*(gfp_t *)flags_ptr); names = gfpflag_names; break; default: return error_string(buf, end, "(%pG?)", spec); } return format_flags(buf, end, flags, names); } static noinline_for_stack char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, char *end) { int depth; /* Loop starting from the root node to the current node. */ for (depth = fwnode_count_parents(fwnode); depth >= 0; depth--) { /* * Only get a reference for other nodes (i.e. parent nodes). * fwnode refcount may be 0 here. */ struct fwnode_handle *__fwnode = depth ? fwnode_get_nth_parent(fwnode, depth) : fwnode; buf = string(buf, end, fwnode_get_name_prefix(__fwnode), default_str_spec); buf = string(buf, end, fwnode_get_name(__fwnode), default_str_spec); if (depth) fwnode_handle_put(__fwnode); } return buf; } static noinline_for_stack char *device_node_string(char *buf, char *end, struct device_node *dn, struct printf_spec spec, const char *fmt) { char tbuf[sizeof("xxxx") + 1]; const char *p; int ret; char *buf_start = buf; struct property *prop; bool has_mult, pass; struct printf_spec str_spec = spec; str_spec.field_width = -1; if (fmt[0] != 'F') return error_string(buf, end, "(%pO?)", spec); if (!IS_ENABLED(CONFIG_OF)) return error_string(buf, end, "(%pOF?)", spec); if (check_pointer(&buf, end, dn, spec)) return buf; /* simple case without anything any more format specifiers */ fmt++; if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0) fmt = "f"; for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) { int precision; if (pass) { if (buf < end) *buf = ':'; buf++; } switch (*fmt) { case 'f': /* full_name */ buf = fwnode_full_name_string(of_fwnode_handle(dn), buf, end); break; case 'n': /* name */ p = fwnode_get_name(of_fwnode_handle(dn)); precision = str_spec.precision; str_spec.precision = strchrnul(p, '@') - p; buf = string(buf, end, p, str_spec); str_spec.precision = precision; break; case 'p': /* phandle */ buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec); break; case 'P': /* path-spec */ p = fwnode_get_name(of_fwnode_handle(dn)); if (!p[1]) p = "/"; buf = string(buf, end, p, str_spec); break; case 'F': /* flags */ tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'; tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'; tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'; tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-'; tbuf[4] = 0; buf = string_nocheck(buf, end, tbuf, str_spec); break; case 'c': /* major compatible string */ ret = of_property_read_string(dn, "compatible", &p); if (!ret) buf = string(buf, end, p, str_spec); break; case 'C': /* full compatible string */ has_mult = false; of_property_for_each_string(dn, "compatible", prop, p) { if (has_mult) buf = string_nocheck(buf, end, ",", str_spec); buf = string_nocheck(buf, end, "\"", str_spec); buf = string(buf, end, p, str_spec); buf = string_nocheck(buf, end, "\"", str_spec); has_mult = true; } break; default: break; } } return widen_string(buf, buf - buf_start, end, spec); } static noinline_for_stack char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, struct printf_spec spec, const char *fmt) { struct printf_spec str_spec = spec; char *buf_start = buf; str_spec.field_width = -1; if (*fmt != 'w') return error_string(buf, end, "(%pf?)", spec); if (check_pointer(&buf, end, fwnode, spec)) return buf; fmt++; switch (*fmt) { case 'P': /* name */ buf = string(buf, end, fwnode_get_name(fwnode), str_spec); break; case 'f': /* full_name */ default: buf = fwnode_full_name_string(fwnode, buf, end); break; } return widen_string(buf, buf - buf_start, end, spec); } int __init no_hash_pointers_enable(char *str) { if (no_hash_pointers) return 0; no_hash_pointers = true; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** This system shows unhashed kernel memory addresses **\n"); pr_warn("** via the console, logs, and other interfaces. This **\n"); pr_warn("** might reduce the security of your system. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! **\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); return 0; } early_param("no_hash_pointers", no_hash_pointers_enable); /* Used for Rust formatting ('%pA'). */ char *rust_fmt_argument(char *buf, char *end, void *ptr); /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format * specifiers. * * Please update scripts/checkpatch.pl when adding/removing conversion * characters. (Search for "check for vsprintf extension"). * * Right now we handle: * * - 'S' For symbolic direct pointers (or function descriptors) with offset * - 's' For symbolic direct pointers (or function descriptors) without offset * - '[Ss]R' as above with __builtin_extract_return_addr() translation * - 'S[R]b' as above with module build ID (for use in backtraces) * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of * %ps and %pS. Be careful when re-using these specifiers. * - 'B' For backtraced symbolic direct pointers with offset * - 'Bb' as above with module build ID (for use in backtraces) * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'b[l]' For a bitmap, the number of bits is determined by the field * width which must be explicitly specified either as part of the * format string '%32b[l]' or through '%*b[l]', [l] selects * range-list format instead of hex format * - 'M' For a 6-byte MAC address, it prints the address in the * usual colon-separated hex notation * - 'm' For a 6-byte MAC address, it prints the hex address without colons * - 'MF' For a 6-byte MAC FDDI address, it prints the address * with a dash-separated hex notation * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth) * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way * IPv4 uses dot-separated decimal without leading 0's (1.2.3.4) * IPv6 uses colon separated network-order 16 bit hex with leading 0's * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - 'i' [46] for 'raw' IPv4/IPv6 addresses * IPv6 omits the colons (01020304...0f) * IPv4 uses dot-separated decimal with leading 0's (010.123.045.006) * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order * - 'I[6S]c' for IPv6 addresses printed as specified by * https://tools.ietf.org/html/rfc5952 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination * of the following flags (see string_escape_mem() for the * details): * a - ESCAPE_ANY * c - ESCAPE_SPECIAL * h - ESCAPE_HEX * n - ESCAPE_NULL * o - ESCAPE_OCTAL * p - ESCAPE_NP * s - ESCAPE_SPACE * By default ESCAPE_ANY_NP is used. * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form * "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" * Options for %pU are: * b big endian lower case hex (default) * B big endian UPPER case hex * l little endian lower case hex * L little endian UPPER case hex * big endian output byte order is: * [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15] * little endian output byte order is: * [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15] * - 'V' For a struct va_format which contains a format string * and va_list *, * call vsnprintf(->format, *->va_list). * Implements a "recursive vsnprintf". * Do not use this feature without some mechanism to verify the * correctness of the format string and va_list arguments. * - 'K' For a kernel pointer that should be hidden from unprivileged users. * Use only for procfs, sysfs and similar files, not printk(); please * read the documentation (path below) first. * - 'NF' For a netdev_features_t * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value. * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with * a certain separator (' ' by default): * C colon * D dash * N no separator * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives * (default assumed to be phys_addr_t, passed by reference) * - 'd[234]' For a dentry name (optionally 2-4 last components) * - 'D[234]' Same as 'd' but for a struct file * - 'g' For block_device name (gendisk + partition number) * - 't[RT][dt][r][s]' For time and date as represented by: * R struct rtc_time * T time64_t * - 'C' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'G' For flags to be printed as a collection of symbolic strings that would * construct the specific value. Supported flags given by option: * p page flags (see struct page) given as pointer to unsigned long * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t * v vma flags (VM_*) given as pointer to unsigned long * - 'OF[fnpPcCF]' For a device tree object * Without any optional arguments prints the full_name * f device node full_name * n device node name * p device node phandle * P device node path spec (name + @unit) * F device node flags * c major compatible string * C full compatible string * - 'fw[fP]' For a firmware node (struct fwnode_handle) pointer * Without an option prints the full name of the node * f full name * P node name, including a possible unit address * - 'x' For printing the address unmodified. Equivalent to "%lx". * Please read the documentation (path below) before using! * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of * bpf_trace_printk() where [ku] prefix specifies either kernel (k) * or user (u) memory to probe, and: * s a string, equivalent to "%s" on direct vsnprintf() use * * ** When making changes please also update: * Documentation/core-api/printk-formats.rst * * Note: The default behaviour (unadorned %p) is to hash the address, * rendering it useful as a unique identifier. * * There is also a '%pA' format specifier, but it is only intended to be used * from Rust code to format core::fmt::Arguments. Do *not* use it from C. * See rust/kernel/print.rs for details. */ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { switch (*fmt) { case 'S': case 's': ptr = dereference_symbol_descriptor(ptr); fallthrough; case 'B': return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_string(buf, end, ptr, spec, fmt); case 'h': return hex_string(buf, end, ptr, spec, fmt); case 'b': switch (fmt[1]) { case 'l': return bitmap_list_string(buf, end, ptr, spec, fmt); default: return bitmap_string(buf, end, ptr, spec, fmt); } case 'M': /* Colon separated: 00:01:02:03:04:05 */ case 'm': /* Contiguous: 000102030405 */ /* [mM]F (FDDI) */ /* [mM]R (Reverse order; Bluetooth) */ return mac_address_string(buf, end, ptr, spec, fmt); case 'I': /* Formatted IP supported * 4: 1.2.3.4 * 6: 0001:0203:...:0708 * 6c: 1::708 or 1::1.2.3.4 */ case 'i': /* Contiguous: * 4: 001.002.003.004 * 6: 000102...0f */ return ip_addr_string(buf, end, ptr, spec, fmt); case 'E': return escaped_string(buf, end, ptr, spec, fmt); case 'U': return uuid_string(buf, end, ptr, spec, fmt); case 'V': return va_format(buf, end, ptr, spec, fmt); case 'K': return restricted_pointer(buf, end, ptr, spec); case 'N': return netdev_bits(buf, end, ptr, spec, fmt); case '4': return fourcc_string(buf, end, ptr, spec, fmt); case 'a': return address_val(buf, end, ptr, spec, fmt); case 'd': return dentry_name(buf, end, ptr, spec, fmt); case 't': return time_and_date(buf, end, ptr, spec, fmt); case 'C': return clock(buf, end, ptr, spec, fmt); case 'D': return file_dentry_name(buf, end, ptr, spec, fmt); #ifdef CONFIG_BLOCK case 'g': return bdev_name(buf, end, ptr, spec, fmt); #endif case 'G': return flags_string(buf, end, ptr, spec, fmt); case 'O': return device_node_string(buf, end, ptr, spec, fmt + 1); case 'f': return fwnode_string(buf, end, ptr, spec, fmt + 1); case 'A': if (!IS_ENABLED(CONFIG_RUST)) { WARN_ONCE(1, "Please remove %%pA from non-Rust code\n"); return error_string(buf, end, "(%pA?)", spec); } return rust_fmt_argument(buf, end, ptr); case 'x': return pointer_string(buf, end, ptr, spec); case 'e': /* %pe with a non-ERR_PTR gets treated as plain %p */ if (!IS_ERR(ptr)) return default_pointer(buf, end, ptr, spec); return err_ptr(buf, end, ptr, spec); case 'u': case 'k': switch (fmt[1]) { case 's': return string(buf, end, ptr, spec); default: return error_string(buf, end, "(einval)", spec); } default: return default_pointer(buf, end, ptr, spec); } } /* * Helper function to decode printf style format. * Each call decode a token from the format and return the * number of characters read (or likely the delta where it wants * to go on the next call). * The decoded token is returned through the parameters * * 'h', 'l', or 'L' for integer fields * 'z' support added 23/7/1999 S.H. * 'z' changed to 'Z' --davidm 1/25/99 * 'Z' changed to 'z' --adobriyan 2017-01-25 * 't' added for ptrdiff_t * * @fmt: the format string * @type of the token returned * @flags: various flags such as +, -, # tokens.. * @field_width: overwritten width * @base: base of the number (octal, hex, ...) * @precision: precision of a number * @qualifier: qualifier of a number (long, size_t, ...) */ static noinline_for_stack int format_decode(const char *fmt, struct printf_spec *spec) { const char *start = fmt; char qualifier; /* we finished early by reading the field width */ if (spec->type == FORMAT_TYPE_WIDTH) { if (spec->field_width < 0) { spec->field_width = -spec->field_width; spec->flags |= LEFT; } spec->type = FORMAT_TYPE_NONE; goto precision; } /* we finished early by reading the precision */ if (spec->type == FORMAT_TYPE_PRECISION) { if (spec->precision < 0) spec->precision = 0; spec->type = FORMAT_TYPE_NONE; goto qualifier; } /* By default */ spec->type = FORMAT_TYPE_NONE; for (; *fmt ; ++fmt) { if (*fmt == '%') break; } /* Return the current non-format string */ if (fmt != start || !*fmt) return fmt - start; /* Process flags */ spec->flags = 0; while (1) { /* this also skips first '%' */ bool found = true; ++fmt; switch (*fmt) { case '-': spec->flags |= LEFT; break; case '+': spec->flags |= PLUS; break; case ' ': spec->flags |= SPACE; break; case '#': spec->flags |= SPECIAL; break; case '0': spec->flags |= ZEROPAD; break; default: found = false; } if (!found) break; } /* get field width */ spec->field_width = -1; if (isdigit(*fmt)) spec->field_width = skip_atoi(&fmt); else if (*fmt == '*') { /* it's the next argument */ spec->type = FORMAT_TYPE_WIDTH; return ++fmt - start; } precision: /* get the precision */ spec->precision = -1; if (*fmt == '.') { ++fmt; if (isdigit(*fmt)) { spec->precision = skip_atoi(&fmt); if (spec->precision < 0) spec->precision = 0; } else if (*fmt == '*') { /* it's the next argument */ spec->type = FORMAT_TYPE_PRECISION; return ++fmt - start; } } qualifier: /* get the conversion qualifier */ qualifier = 0; if (*fmt == 'h' || _tolower(*fmt) == 'l' || *fmt == 'z' || *fmt == 't') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'l') { qualifier = 'L'; ++fmt; } else if (qualifier == 'h') { qualifier = 'H'; ++fmt; } } } /* default base */ spec->base = 10; switch (*fmt) { case 'c': spec->type = FORMAT_TYPE_CHAR; return ++fmt - start; case 's': spec->type = FORMAT_TYPE_STR; return ++fmt - start; case 'p': spec->type = FORMAT_TYPE_PTR; return ++fmt - start; case '%': spec->type = FORMAT_TYPE_PERCENT_CHAR; return ++fmt - start; /* integer number formats - set up the flags and "break" */ case 'o': spec->base = 8; break; case 'x': spec->flags |= SMALL; fallthrough; case 'X': spec->base = 16; break; case 'd': case 'i': spec->flags |= SIGN; break; case 'u': break; case 'n': /* * Since %n poses a greater security risk than * utility, treat it as any other invalid or * unsupported format specifier. */ fallthrough; default: WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt); spec->type = FORMAT_TYPE_INVALID; return fmt - start; } if (qualifier == 'L') spec->type = FORMAT_TYPE_LONG_LONG; else if (qualifier == 'l') { BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); } else if (qualifier == 'z') { spec->type = FORMAT_TYPE_SIZE_T; } else if (qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; } else if (qualifier == 'H') { BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE); spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); } else if (qualifier == 'h') { BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT); spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN); } else { BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT); spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN); } return ++fmt - start; } static void set_field_width(struct printf_spec *spec, int width) { spec->field_width = width; if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) { spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX); } } static void set_precision(struct printf_spec *spec, int prec) { spec->precision = prec; if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) { spec->precision = clamp(prec, 0, PRECISION_MAX); } } /** * vsnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * This function generally follows C99 vsnprintf, but has some * extensions and a few limitations: * * - ``%n`` is unsupported * - ``%p*`` is handled by pointer() * * See pointer() or Documentation/core-api/printk-formats.rst for more * extensive description. * * **Please update the documentation in both places when making changes** * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. * * If you're not already dealing with a va_list consider using snprintf(). */ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) { unsigned long long num; char *str, *end; struct printf_spec spec = {0}; /* Reject out-of-range values early. Large positive sizes are used for unknown buffer sizes. */ if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt) { const char *old_fmt = fmt; int read = format_decode(fmt, &spec); fmt += read; switch (spec.type) { case FORMAT_TYPE_NONE: { int copy = read; if (str < end) { if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; break; } case FORMAT_TYPE_WIDTH: set_field_width(&spec, va_arg(args, int)); break; case FORMAT_TYPE_PRECISION: set_precision(&spec, va_arg(args, int)); break; case FORMAT_TYPE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) va_arg(args, int); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } break; } case FORMAT_TYPE_STR: str = string(str, end, va_arg(args, char *), spec); break; case FORMAT_TYPE_PTR: str = pointer(fmt, str, end, va_arg(args, void *), spec); while (isalnum(*fmt)) fmt++; break; case FORMAT_TYPE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; break; case FORMAT_TYPE_INVALID: /* * Presumably the arguments passed gcc's type * checking, but there is no safe or sane way * for us to continue parsing the format and * fetching from the va_list; the remaining * specifiers and arguments would be out of * sync. */ goto out; default: switch (spec.type) { case FORMAT_TYPE_LONG_LONG: num = va_arg(args, long long); break; case FORMAT_TYPE_ULONG: num = va_arg(args, unsigned long); break; case FORMAT_TYPE_LONG: num = va_arg(args, long); break; case FORMAT_TYPE_SIZE_T: if (spec.flags & SIGN) num = va_arg(args, ssize_t); else num = va_arg(args, size_t); break; case FORMAT_TYPE_PTRDIFF: num = va_arg(args, ptrdiff_t); break; case FORMAT_TYPE_UBYTE: num = (unsigned char) va_arg(args, int); break; case FORMAT_TYPE_BYTE: num = (signed char) va_arg(args, int); break; case FORMAT_TYPE_USHORT: num = (unsigned short) va_arg(args, int); break; case FORMAT_TYPE_SHORT: num = (short) va_arg(args, int); break; case FORMAT_TYPE_INT: num = (int) va_arg(args, int); break; default: num = va_arg(args, unsigned int); } str = number(str, end, num, spec); } } out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } /* the trailing null byte doesn't count towards the total */ return str-buf; } EXPORT_SYMBOL(vsnprintf); /** * vscnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * The return value is the number of characters which have been written into * the @buf not including the trailing '\0'. If @size is == 0 the function * returns 0. * * If you're not already dealing with a va_list consider using scnprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; if (unlikely(!size)) return 0; i = vsnprintf(buf, size, fmt, args); if (likely(i < size)) return i; return size - 1; } EXPORT_SYMBOL(vscnprintf); /** * snprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters which would be * generated for the given input, excluding the trailing null, * as per ISO C99. If the return is greater than or equal to * @size, the resulting string is truncated. * * See the vsnprintf() documentation for format string extensions over C99. */ int snprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(snprintf); /** * scnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including * the trailing '\0'. If @size is == 0 the function returns 0. */ int scnprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vscnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(scnprintf); /** * vsprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @args: Arguments for the format string * * The function returns the number of characters written * into @buf. Use vsnprintf() or vscnprintf() in order to avoid * buffer overflows. * * If you're not already dealing with a va_list consider using sprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vsprintf(char *buf, const char *fmt, va_list args) { return vsnprintf(buf, INT_MAX, fmt, args); } EXPORT_SYMBOL(vsprintf); /** * sprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @...: Arguments for the format string * * The function returns the number of characters written * into @buf. Use snprintf() or scnprintf() in order to avoid * buffer overflows. * * See the vsnprintf() documentation for format string extensions over C99. */ int sprintf(char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, INT_MAX, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sprintf); #ifdef CONFIG_BINARY_PRINTF /* * bprintf service: * vbin_printf() - VA arguments to binary data * bstr_printf() - Binary data to text string */ /** * vbin_printf - Parse a format string and place args' binary value in a buffer * @bin_buf: The buffer to place args' binary value * @size: The size of the buffer(by words(32bits), not characters) * @fmt: The format string to use * @args: Arguments for the format string * * The format follows C99 vsnprintf, except %n is ignored, and its argument * is skipped. * * The return value is the number of words(32bits) which would be generated for * the given input. * * NOTE: * If the return value is greater than @size, the resulting bin_buf is NOT * valid for bstr_printf(). */ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) { struct printf_spec spec = {0}; char *str, *end; int width; str = (char *)bin_buf; end = (char *)(bin_buf + size); #define save_arg(type) \ ({ \ unsigned long long value; \ if (sizeof(type) == 8) { \ unsigned long long val8; \ str = PTR_ALIGN(str, sizeof(u32)); \ val8 = va_arg(args, unsigned long long); \ if (str + sizeof(type) <= end) { \ *(u32 *)str = *(u32 *)&val8; \ *(u32 *)(str + 4) = *((u32 *)&val8 + 1); \ } \ value = val8; \ } else { \ unsigned int val4; \ str = PTR_ALIGN(str, sizeof(type)); \ val4 = va_arg(args, int); \ if (str + sizeof(type) <= end) \ *(typeof(type) *)str = (type)(long)val4; \ value = (unsigned long long)val4; \ } \ str += sizeof(type); \ value; \ }) while (*fmt) { int read = format_decode(fmt, &spec); fmt += read; switch (spec.type) { case FORMAT_TYPE_NONE: case FORMAT_TYPE_PERCENT_CHAR: break; case FORMAT_TYPE_INVALID: goto out; case FORMAT_TYPE_WIDTH: case FORMAT_TYPE_PRECISION: width = (int)save_arg(int); /* Pointers may require the width */ if (*fmt == 'p') set_field_width(&spec, width); break; case FORMAT_TYPE_CHAR: save_arg(char); break; case FORMAT_TYPE_STR: { const char *save_str = va_arg(args, char *); const char *err_msg; size_t len; err_msg = check_pointer_msg(save_str); if (err_msg) save_str = err_msg; len = strlen(save_str) + 1; if (str + len < end) memcpy(str, save_str, len); str += len; break; } case FORMAT_TYPE_PTR: /* Dereferenced pointers must be done now */ switch (*fmt) { /* Dereference of functions is still OK */ case 'S': case 's': case 'x': case 'K': case 'e': save_arg(void *); break; default: if (!isalnum(*fmt)) { save_arg(void *); break; } str = pointer(fmt, str, end, va_arg(args, void *), spec); if (str + 1 < end) *str++ = '\0'; else end[-1] = '\0'; /* Must be nul terminated */ } /* skip all alphanumeric pointer suffixes */ while (isalnum(*fmt)) fmt++; break; default: switch (spec.type) { case FORMAT_TYPE_LONG_LONG: save_arg(long long); break; case FORMAT_TYPE_ULONG: case FORMAT_TYPE_LONG: save_arg(unsigned long); break; case FORMAT_TYPE_SIZE_T: save_arg(size_t); break; case FORMAT_TYPE_PTRDIFF: save_arg(ptrdiff_t); break; case FORMAT_TYPE_UBYTE: case FORMAT_TYPE_BYTE: save_arg(char); break; case FORMAT_TYPE_USHORT: case FORMAT_TYPE_SHORT: save_arg(short); break; default: save_arg(int); } } } out: return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; #undef save_arg } EXPORT_SYMBOL_GPL(vbin_printf); /** * bstr_printf - Format a string from binary arguments and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @bin_buf: Binary arguments for the format string * * This function like C99 vsnprintf, but the difference is that vsnprintf gets * arguments from stack, and bstr_printf gets arguments from @bin_buf which is * a binary buffer that generated by vbin_printf. * * The format follows C99 vsnprintf, but has some extensions: * see vsnprintf comment for details. * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. */ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) { struct printf_spec spec = {0}; char *str, *end; const char *args = (const char *)bin_buf; if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; #define get_arg(type) \ ({ \ typeof(type) value; \ if (sizeof(type) == 8) { \ args = PTR_ALIGN(args, sizeof(u32)); \ *(u32 *)&value = *(u32 *)args; \ *((u32 *)&value + 1) = *(u32 *)(args + 4); \ } else { \ args = PTR_ALIGN(args, sizeof(type)); \ value = *(typeof(type) *)args; \ } \ args += sizeof(type); \ value; \ }) /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt) { const char *old_fmt = fmt; int read = format_decode(fmt, &spec); fmt += read; switch (spec.type) { case FORMAT_TYPE_NONE: { int copy = read; if (str < end) { if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; break; } case FORMAT_TYPE_WIDTH: set_field_width(&spec, get_arg(int)); break; case FORMAT_TYPE_PRECISION: set_precision(&spec, get_arg(int)); break; case FORMAT_TYPE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) get_arg(char); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } break; } case FORMAT_TYPE_STR: { const char *str_arg = args; args += strlen(str_arg) + 1; str = string(str, end, (char *)str_arg, spec); break; } case FORMAT_TYPE_PTR: { bool process = false; int copy, len; /* Non function dereferences were already done */ switch (*fmt) { case 'S': case 's': case 'x': case 'K': case 'e': process = true; break; default: if (!isalnum(*fmt)) { process = true; break; } /* Pointer dereference was already processed */ if (str < end) { len = copy = strlen(args); if (copy > end - str) copy = end - str; memcpy(str, args, copy); str += len; args += len + 1; } } if (process) str = pointer(fmt, str, end, get_arg(void *), spec); while (isalnum(*fmt)) fmt++; break; } case FORMAT_TYPE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; break; case FORMAT_TYPE_INVALID: goto out; default: { unsigned long long num; switch (spec.type) { case FORMAT_TYPE_LONG_LONG: num = get_arg(long long); break; case FORMAT_TYPE_ULONG: case FORMAT_TYPE_LONG: num = get_arg(unsigned long); break; case FORMAT_TYPE_SIZE_T: num = get_arg(size_t); break; case FORMAT_TYPE_PTRDIFF: num = get_arg(ptrdiff_t); break; case FORMAT_TYPE_UBYTE: num = get_arg(unsigned char); break; case FORMAT_TYPE_BYTE: num = get_arg(signed char); break; case FORMAT_TYPE_USHORT: num = get_arg(unsigned short); break; case FORMAT_TYPE_SHORT: num = get_arg(short); break; case FORMAT_TYPE_UINT: num = get_arg(unsigned int); break; default: num = get_arg(int); } str = number(str, end, num, spec); } /* default: */ } /* switch(spec.type) */ } /* while(*fmt) */ out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } #undef get_arg /* the trailing null byte doesn't count towards the total */ return str - buf; } EXPORT_SYMBOL_GPL(bstr_printf); /** * bprintf - Parse a format string and place args' binary value in a buffer * @bin_buf: The buffer to place args' binary value * @size: The size of the buffer(by words(32bits), not characters) * @fmt: The format string to use * @...: Arguments for the format string * * The function returns the number of words(u32) written * into @bin_buf. */ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = vbin_printf(bin_buf, size, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL_GPL(bprintf); #endif /* CONFIG_BINARY_PRINTF */ /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: format of buffer * @args: arguments */ int vsscanf(const char *buf, const char *fmt, va_list args) { const char *str = buf; char *next; char digit; int num = 0; u8 qualifier; unsigned int base; union { long long s; unsigned long long u; } val; s16 field_width; bool is_sign; while (*fmt) { /* skip any white space in format */ /* white space in format matches any amount of * white space, including none, in the input. */ if (isspace(*fmt)) { fmt = skip_spaces(++fmt); str = skip_spaces(str); } /* anything that is not a conversion must match exactly */ if (*fmt != '%' && *fmt) { if (*fmt++ != *str++) break; continue; } if (!*fmt) break; ++fmt; /* skip this conversion. * advance both strings to next white space */ if (*fmt == '*') { if (!*str) break; while (!isspace(*fmt) && *fmt != '%' && *fmt) { /* '%*[' not yet supported, invalid format */ if (*fmt == '[') return num; fmt++; } while (!isspace(*str) && *str) str++; continue; } /* get field width */ field_width = -1; if (isdigit(*fmt)) { field_width = skip_atoi(&fmt); if (field_width <= 0) break; } /* get conversion qualifier */ qualifier = -1; if (*fmt == 'h' || _tolower(*fmt) == 'l' || *fmt == 'z') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'h') { qualifier = 'H'; fmt++; } else if (qualifier == 'l') { qualifier = 'L'; fmt++; } } } if (!*fmt) break; if (*fmt == 'n') { /* return number of characters read so far */ *va_arg(args, int *) = str - buf; ++fmt; continue; } if (!*str) break; base = 10; is_sign = false; switch (*fmt++) { case 'c': { char *s = (char *)va_arg(args, char*); if (field_width == -1) field_width = 1; do { *s++ = *str++; } while (--field_width > 0 && *str); num++; } continue; case 's': { char *s = (char *)va_arg(args, char *); if (field_width == -1) field_width = SHRT_MAX; /* first, skip leading white space in buffer */ str = skip_spaces(str); /* now copy until next white space */ while (*str && !isspace(*str) && field_width--) *s++ = *str++; *s = '\0'; num++; } continue; /* * Warning: This implementation of the '[' conversion specifier * deviates from its glibc counterpart in the following ways: * (1) It does NOT support ranges i.e. '-' is NOT a special * character * (2) It cannot match the closing bracket ']' itself * (3) A field width is required * (4) '%*[' (discard matching input) is currently not supported * * Example usage: * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]", * buf1, buf2, buf3); * if (ret < 3) * // etc.. */ case '[': { char *s = (char *)va_arg(args, char *); DECLARE_BITMAP(set, 256) = {0}; unsigned int len = 0; bool negate = (*fmt == '^'); /* field width is required */ if (field_width == -1) return num; if (negate) ++fmt; for ( ; *fmt && *fmt != ']'; ++fmt, ++len) __set_bit((u8)*fmt, set); /* no ']' or no character set found */ if (!*fmt || !len) return num; ++fmt; if (negate) { bitmap_complement(set, set, 256); /* exclude null '\0' byte */ __clear_bit(0, set); } /* match must be non-empty */ if (!test_bit((u8)*str, set)) return num; while (test_bit((u8)*str, set) && field_width--) *s++ = *str++; *s = '\0'; ++num; } continue; case 'o': base = 8; break; case 'x': case 'X': base = 16; break; case 'i': base = 0; fallthrough; case 'd': is_sign = true; fallthrough; case 'u': break; case '%': /* looking for '%' in str */ if (*str++ != '%') return num; continue; default: /* invalid format; stop here */ return num; } /* have some sort of integer conversion. * first, skip white space in buffer. */ str = skip_spaces(str); digit = *str; if (is_sign && digit == '-') { if (field_width == 1) break; digit = *(str + 1); } if (!digit || (base == 16 && !isxdigit(digit)) || (base == 10 && !isdigit(digit)) || (base == 8 && !isodigit(digit)) || (base == 0 && !isdigit(digit))) break; if (is_sign) val.s = simple_strntoll(str, &next, base, field_width >= 0 ? field_width : INT_MAX); else val.u = simple_strntoull(str, &next, base, field_width >= 0 ? field_width : INT_MAX); switch (qualifier) { case 'H': /* that's 'hh' in format */ if (is_sign) *va_arg(args, signed char *) = val.s; else *va_arg(args, unsigned char *) = val.u; break; case 'h': if (is_sign) *va_arg(args, short *) = val.s; else *va_arg(args, unsigned short *) = val.u; break; case 'l': if (is_sign) *va_arg(args, long *) = val.s; else *va_arg(args, unsigned long *) = val.u; break; case 'L': if (is_sign) *va_arg(args, long long *) = val.s; else *va_arg(args, unsigned long long *) = val.u; break; case 'z': *va_arg(args, size_t *) = val.u; break; default: if (is_sign) *va_arg(args, int *) = val.s; else *va_arg(args, unsigned int *) = val.u; break; } num++; if (!next) break; str = next; } return num; } EXPORT_SYMBOL(vsscanf); /** * sscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: formatting of buffer * @...: resulting arguments */ int sscanf(const char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsscanf(buf, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sscanf);
4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0 /* * procfs-based user access to knfsd statistics * * /proc/net/rpc/nfsd * * Format: * rc <hits> <misses> <nocache> * Statistsics for the reply cache * fh <stale> <deprecated filehandle cache stats> * statistics for filehandle lookup * io <bytes-read> <bytes-written> * statistics for IO throughput * th <threads> <deprecated thread usage histogram stats> * number of threads * ra <deprecated ra-cache stats> * * plus generic RPC stats (see net/sunrpc/stats.c) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/sunrpc/stats.h> #include <net/net_namespace.h> #include "nfsd.h" static int nfsd_show(struct seq_file *seq, void *v) { struct net *net = pde_data(file_inode(seq->file)); struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) seq_puts(seq, " 0.000"); /* deprecated ra-cache stats */ seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); /* show my rpc info */ svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); } seq_printf(seq, "\nwdeleg_getattr %lld", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif return 0; } DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); int nfsd_percpu_counters_init(struct percpu_counter *counters, int num) { int i, err = 0; for (i = 0; !err && i < num; i++) err = percpu_counter_init(&counters[i], 0, GFP_KERNEL); if (!err) return 0; for (; i > 0; i--) percpu_counter_destroy(&counters[i-1]); return err; } void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num) { int i; for (i = 0; i < num; i++) percpu_counter_set(&counters[i], 0); } void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) { int i; for (i = 0; i < num; i++) percpu_counter_destroy(&counters[i]); } int nfsd_stat_counters_init(struct nfsd_net *nn) { return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM); } void nfsd_stat_counters_destroy(struct nfsd_net *nn) { nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM); } void nfsd_proc_stat_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } void nfsd_proc_stat_shutdown(struct net *net) { svc_proc_unregister(net, "nfsd"); }
8 3 22 2 4 2 27 4 10 2 4 6 2 4 4 6 6 21 19 19 19 19 19 41 41 41 3 3 3 19 20 7 2 18 17 4 2 2 1 1 12 11 2 3 7 3 43 20 7 42 25 5 10 2 9 6 2 2 46 47 2 27 21 49 2 39 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 // SPDX-License-Identifier: GPL-2.0 /* * trace_events_filter - generic event filtering * * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/perf_event.h> #include <linux/slab.h> #include "trace.h" #include "trace_output.h" #define DEFAULT_SYS_FILTER_MESSAGE \ "### global filter ###\n" \ "# Use this to set filters for multiple events.\n" \ "# Only events with the given fields will be affected.\n" \ "# If no events are modified, an error message will be displayed here" /* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */ #define OPS \ C( OP_GLOB, "~" ), \ C( OP_NE, "!=" ), \ C( OP_EQ, "==" ), \ C( OP_LE, "<=" ), \ C( OP_LT, "<" ), \ C( OP_GE, ">=" ), \ C( OP_GT, ">" ), \ C( OP_BAND, "&" ), \ C( OP_MAX, NULL ) #undef C #define C(a, b) a enum filter_op_ids { OPS }; #undef C #define C(a, b) b static const char * ops[] = { OPS }; enum filter_pred_fn { FILTER_PRED_FN_NOP, FILTER_PRED_FN_64, FILTER_PRED_FN_64_CPUMASK, FILTER_PRED_FN_S64, FILTER_PRED_FN_U64, FILTER_PRED_FN_32, FILTER_PRED_FN_32_CPUMASK, FILTER_PRED_FN_S32, FILTER_PRED_FN_U32, FILTER_PRED_FN_16, FILTER_PRED_FN_16_CPUMASK, FILTER_PRED_FN_S16, FILTER_PRED_FN_U16, FILTER_PRED_FN_8, FILTER_PRED_FN_8_CPUMASK, FILTER_PRED_FN_S8, FILTER_PRED_FN_U8, FILTER_PRED_FN_COMM, FILTER_PRED_FN_STRING, FILTER_PRED_FN_STRLOC, FILTER_PRED_FN_STRRELLOC, FILTER_PRED_FN_PCHAR_USER, FILTER_PRED_FN_PCHAR, FILTER_PRED_FN_CPU, FILTER_PRED_FN_CPU_CPUMASK, FILTER_PRED_FN_CPUMASK, FILTER_PRED_FN_CPUMASK_CPU, FILTER_PRED_FN_FUNCTION, FILTER_PRED_FN_, FILTER_PRED_TEST_VISITED, }; struct filter_pred { struct regex *regex; struct cpumask *mask; unsigned short *ops; struct ftrace_event_field *field; u64 val; u64 val2; enum filter_pred_fn fn_num; int offset; int not; int op; }; /* * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND * pred_funcs_##type below must match the order of them above. */ #define PRED_FUNC_START OP_LE #define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START) #define ERRORS \ C(NONE, "No error"), \ C(INVALID_OP, "Invalid operator"), \ C(TOO_MANY_OPEN, "Too many '('"), \ C(TOO_MANY_CLOSE, "Too few '('"), \ C(MISSING_QUOTE, "Missing matching quote"), \ C(MISSING_BRACE_OPEN, "Missing '{'"), \ C(MISSING_BRACE_CLOSE, "Missing '}'"), \ C(OPERAND_TOO_LONG, "Operand too long"), \ C(EXPECT_STRING, "Expecting string field"), \ C(EXPECT_DIGIT, "Expecting numeric field"), \ C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \ C(FIELD_NOT_FOUND, "Field not found"), \ C(ILLEGAL_INTVAL, "Illegal integer value"), \ C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ C(INVALID_FILTER, "Meaningless filter expression"), \ C(INVALID_CPULIST, "Invalid cpulist"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ C(NO_FUNCTION, "Function not found"), \ C(ERRNO, "Error"), \ C(NO_FILTER, "No filter found") #undef C #define C(a, b) FILT_ERR_##a enum { ERRORS }; #undef C #define C(a, b) b static const char *err_text[] = { ERRORS }; /* Called after a '!' character but "!=" and "!~" are not "not"s */ static bool is_not(const char *str) { switch (str[1]) { case '=': case '~': return false; } return true; } /** * struct prog_entry - a singe entry in the filter program * @target: Index to jump to on a branch (actually one minus the index) * @when_to_branch: The value of the result of the predicate to do a branch * @pred: The predicate to execute. */ struct prog_entry { int target; int when_to_branch; struct filter_pred *pred; }; /** * update_preds - assign a program entry a label target * @prog: The program array * @N: The index of the current entry in @prog * @invert: What to assign a program entry for its branch condition * * The program entry at @N has a target that points to the index of a program * entry that can have its target and when_to_branch fields updated. * Update the current program entry denoted by index @N target field to be * that of the updated entry. This will denote the entry to update if * we are processing an "||" after an "&&". */ static void update_preds(struct prog_entry *prog, int N, int invert) { int t, s; t = prog[N].target; s = prog[t].target; prog[t].when_to_branch = invert; prog[t].target = N; prog[N].target = s; } struct filter_parse_error { int lasterr; int lasterr_pos; }; static void parse_error(struct filter_parse_error *pe, int err, int pos) { pe->lasterr = err; pe->lasterr_pos = pos; } typedef int (*parse_pred_fn)(const char *str, void *data, int pos, struct filter_parse_error *pe, struct filter_pred **pred); enum { INVERT = 1, PROCESS_AND = 2, PROCESS_OR = 4, }; static void free_predicate(struct filter_pred *pred) { if (pred) { kfree(pred->regex); kfree(pred->mask); kfree(pred); } } /* * Without going into a formal proof, this explains the method that is used in * parsing the logical expressions. * * For example, if we have: "a && !(!b || (c && g)) || d || e && !f" * The first pass will convert it into the following program: * * n1: r=a; l1: if (!r) goto l4; * n2: r=b; l2: if (!r) goto l4; * n3: r=c; r=!r; l3: if (r) goto l4; * n4: r=g; r=!r; l4: if (r) goto l5; * n5: r=d; l5: if (r) goto T * n6: r=e; l6: if (!r) goto l7; * n7: r=f; r=!r; l7: if (!r) goto F * T: return TRUE * F: return FALSE * * To do this, we use a data structure to represent each of the above * predicate and conditions that has: * * predicate, when_to_branch, invert, target * * The "predicate" will hold the function to determine the result "r". * The "when_to_branch" denotes what "r" should be if a branch is to be taken * "&&" would contain "!r" or (0) and "||" would contain "r" or (1). * The "invert" holds whether the value should be reversed before testing. * The "target" contains the label "l#" to jump to. * * A stack is created to hold values when parentheses are used. * * To simplify the logic, the labels will start at 0 and not 1. * * The possible invert values are 1 and 0. The number of "!"s that are in scope * before the predicate determines the invert value, if the number is odd then * the invert value is 1 and 0 otherwise. This means the invert value only * needs to be toggled when a new "!" is introduced compared to what is stored * on the stack, where parentheses were used. * * The top of the stack and "invert" are initialized to zero. * * ** FIRST PASS ** * * #1 A loop through all the tokens is done: * * #2 If the token is an "(", the stack is push, and the current stack value * gets the current invert value, and the loop continues to the next token. * The top of the stack saves the "invert" value to keep track of what * the current inversion is. As "!(a && !b || c)" would require all * predicates being affected separately by the "!" before the parentheses. * And that would end up being equivalent to "(!a || b) && !c" * * #3 If the token is an "!", the current "invert" value gets inverted, and * the loop continues. Note, if the next token is a predicate, then * this "invert" value is only valid for the current program entry, * and does not affect other predicates later on. * * The only other acceptable token is the predicate string. * * #4 A new entry into the program is added saving: the predicate and the * current value of "invert". The target is currently assigned to the * previous program index (this will not be its final value). * * #5 We now enter another loop and look at the next token. The only valid * tokens are ")", "&&", "||" or end of the input string "\0". * * #6 The invert variable is reset to the current value saved on the top of * the stack. * * #7 The top of the stack holds not only the current invert value, but also * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher * precedence than "||". That is "a && b || c && d" is equivalent to * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs * to be processed. This is the case if an "&&" was the last token. If it was * then we call update_preds(). This takes the program, the current index in * the program, and the current value of "invert". More will be described * below about this function. * * #8 If the next token is "&&" then we set a flag in the top of the stack * that denotes that "&&" needs to be processed, break out of this loop * and continue with the outer loop. * * #9 Otherwise, if a "||" needs to be processed then update_preds() is called. * This is called with the program, the current index in the program, but * this time with an inverted value of "invert" (that is !invert). This is * because the value taken will become the "when_to_branch" value of the * program. * Note, this is called when the next token is not an "&&". As stated before, * "&&" takes higher precedence, and "||" should not be processed yet if the * next logical operation is "&&". * * #10 If the next token is "||" then we set a flag in the top of the stack * that denotes that "||" needs to be processed, break out of this loop * and continue with the outer loop. * * #11 If this is the end of the input string "\0" then we break out of both * loops. * * #12 Otherwise, the next token is ")", where we pop the stack and continue * this inner loop. * * Now to discuss the update_pred() function, as that is key to the setting up * of the program. Remember the "target" of the program is initialized to the * previous index and not the "l" label. The target holds the index into the * program that gets affected by the operand. Thus if we have something like * "a || b && c", when we process "a" the target will be "-1" (undefined). * When we process "b", its target is "0", which is the index of "a", as that's * the predicate that is affected by "||". But because the next token after "b" * is "&&" we don't call update_preds(). Instead continue to "c". As the * next token after "c" is not "&&" but the end of input, we first process the * "&&" by calling update_preds() for the "&&" then we process the "||" by * calling updates_preds() with the values for processing "||". * * What does that mean? What update_preds() does is to first save the "target" * of the program entry indexed by the current program entry's "target" * (remember the "target" is initialized to previous program entry), and then * sets that "target" to the current index which represents the label "l#". * That entry's "when_to_branch" is set to the value passed in (the "invert" * or "!invert"). Then it sets the current program entry's target to the saved * "target" value (the old value of the program that had its "target" updated * to the label). * * Looking back at "a || b && c", we have the following steps: * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target * "||" - flag that we need to process "||"; continue outer loop * "b" - prog[1] = { "b", X, 0 } * "&&" - flag that we need to process "&&"; continue outer loop * (Notice we did not process "||") * "c" - prog[2] = { "c", X, 1 } * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&" * t = prog[2].target; // t = 1 * s = prog[t].target; // s = 0 * prog[t].target = 2; // Set target to "l2" * prog[t].when_to_branch = 0; * prog[2].target = s; * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||" * t = prog[2].target; // t = 0 * s = prog[t].target; // s = -1 * prog[t].target = 2; // Set target to "l2" * prog[t].when_to_branch = 1; * prog[2].target = s; * * #13 Which brings us to the final step of the first pass, which is to set * the last program entry's when_to_branch and target, which will be * when_to_branch = 0; target = N; ( the label after the program entry after * the last program entry processed above). * * If we denote "TRUE" to be the entry after the last program entry processed, * and "FALSE" the program entry after that, we are now done with the first * pass. * * Making the above "a || b && c" have a program of: * prog[0] = { "a", 1, 2 } * prog[1] = { "b", 0, 2 } * prog[2] = { "c", 0, 3 } * * Which translates into: * n0: r = a; l0: if (r) goto l2; * n1: r = b; l1: if (!r) goto l2; * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;" * T: return TRUE; l3: * F: return FALSE * * Although, after the first pass, the program is correct, it is * inefficient. The simple sample of "a || b && c" could be easily been * converted into: * n0: r = a; if (r) goto T * n1: r = b; if (!r) goto F * n2: r = c; if (!r) goto F * T: return TRUE; * F: return FALSE; * * The First Pass is over the input string. The next too passes are over * the program itself. * * ** SECOND PASS ** * * Which brings us to the second pass. If a jump to a label has the * same condition as that label, it can instead jump to its target. * The original example of "a && !(!b || (c && g)) || d || e && !f" * where the first pass gives us: * * n1: r=a; l1: if (!r) goto l4; * n2: r=b; l2: if (!r) goto l4; * n3: r=c; r=!r; l3: if (r) goto l4; * n4: r=g; r=!r; l4: if (r) goto l5; * n5: r=d; l5: if (r) goto T * n6: r=e; l6: if (!r) goto l7; * n7: r=f; r=!r; l7: if (!r) goto F: * T: return TRUE; * F: return FALSE * * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;". * And "l5: if (r) goto T", we could optimize this by converting l3 and l4 * to go directly to T. To accomplish this, we start from the last * entry in the program and work our way back. If the target of the entry * has the same "when_to_branch" then we could use that entry's target. * Doing this, the above would end up as: * * n1: r=a; l1: if (!r) goto l4; * n2: r=b; l2: if (!r) goto l4; * n3: r=c; r=!r; l3: if (r) goto T; * n4: r=g; r=!r; l4: if (r) goto T; * n5: r=d; l5: if (r) goto T; * n6: r=e; l6: if (!r) goto F; * n7: r=f; r=!r; l7: if (!r) goto F; * T: return TRUE * F: return FALSE * * In that same pass, if the "when_to_branch" doesn't match, we can simply * go to the program entry after the label. That is, "l2: if (!r) goto l4;" * where "l4: if (r) goto T;", then we can convert l2 to be: * "l2: if (!r) goto n5;". * * This will have the second pass give us: * n1: r=a; l1: if (!r) goto n5; * n2: r=b; l2: if (!r) goto n5; * n3: r=c; r=!r; l3: if (r) goto T; * n4: r=g; r=!r; l4: if (r) goto T; * n5: r=d; l5: if (r) goto T * n6: r=e; l6: if (!r) goto F; * n7: r=f; r=!r; l7: if (!r) goto F * T: return TRUE * F: return FALSE * * Notice, all the "l#" labels are no longer used, and they can now * be discarded. * * ** THIRD PASS ** * * For the third pass we deal with the inverts. As they simply just * make the "when_to_branch" get inverted, a simple loop over the * program to that does: "when_to_branch ^= invert;" will do the * job, leaving us with: * n1: r=a; if (!r) goto n5; * n2: r=b; if (!r) goto n5; * n3: r=c: if (!r) goto T; * n4: r=g; if (!r) goto T; * n5: r=d; if (r) goto T * n6: r=e; if (!r) goto F; * n7: r=f; if (r) goto F * T: return TRUE * F: return FALSE * * As "r = a; if (!r) goto n5;" is obviously the same as * "if (!a) goto n5;" without doing anything we can interpret the * program as: * n1: if (!a) goto n5; * n2: if (!b) goto n5; * n3: if (!c) goto T; * n4: if (!g) goto T; * n5: if (d) goto T * n6: if (!e) goto F; * n7: if (f) goto F * T: return TRUE * F: return FALSE * * Since the inverts are discarded at the end, there's no reason to store * them in the program array (and waste memory). A separate array to hold * the inverts is used and freed at the end. */ static struct prog_entry * predicate_parse(const char *str, int nr_parens, int nr_preds, parse_pred_fn parse_pred, void *data, struct filter_parse_error *pe) { struct prog_entry *prog_stack; struct prog_entry *prog; const char *ptr = str; char *inverts = NULL; int *op_stack; int *top; int invert = 0; int ret = -ENOMEM; int len; int N = 0; int i; nr_preds += 2; /* For TRUE and FALSE */ op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); if (!op_stack) return ERR_PTR(-ENOMEM); prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; } inverts = kmalloc_array(nr_preds, sizeof(*inverts), GFP_KERNEL); if (!inverts) { parse_error(pe, -ENOMEM, 0); goto out_free; } top = op_stack; prog = prog_stack; *top = 0; /* First pass */ while (*ptr) { /* #1 */ const char *next = ptr++; if (isspace(*next)) continue; switch (*next) { case '(': /* #2 */ if (top - op_stack > nr_parens) { ret = -EINVAL; goto out_free; } *(++top) = invert; continue; case '!': /* #3 */ if (!is_not(next)) break; invert = !invert; continue; } if (N >= nr_preds) { parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); goto out_free; } inverts[N] = invert; /* #4 */ prog[N].target = N-1; len = parse_pred(next, data, ptr - str, pe, &prog[N].pred); if (len < 0) { ret = len; goto out_free; } ptr = next + len; N++; ret = -1; while (1) { /* #5 */ next = ptr++; if (isspace(*next)) continue; switch (*next) { case ')': case '\0': break; case '&': case '|': /* accepting only "&&" or "||" */ if (next[1] == next[0]) { ptr++; break; } fallthrough; default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); goto out_free; } invert = *top & INVERT; if (*top & PROCESS_AND) { /* #7 */ update_preds(prog, N - 1, invert); *top &= ~PROCESS_AND; } if (*next == '&') { /* #8 */ *top |= PROCESS_AND; break; } if (*top & PROCESS_OR) { /* #9 */ update_preds(prog, N - 1, !invert); *top &= ~PROCESS_OR; } if (*next == '|') { /* #10 */ *top |= PROCESS_OR; break; } if (!*next) /* #11 */ goto out; if (top == op_stack) { ret = -1; /* Too few '(' */ parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str); goto out_free; } top--; /* #12 */ } } out: if (top != op_stack) { /* Too many '(' */ parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str); goto out_free; } if (!N) { /* No program? */ ret = -EINVAL; parse_error(pe, FILT_ERR_NO_FILTER, ptr - str); goto out_free; } prog[N].pred = NULL; /* #13 */ prog[N].target = 1; /* TRUE */ prog[N+1].pred = NULL; prog[N+1].target = 0; /* FALSE */ prog[N-1].target = N; prog[N-1].when_to_branch = false; /* Second Pass */ for (i = N-1 ; i--; ) { int target = prog[i].target; if (prog[i].when_to_branch == prog[target].when_to_branch) prog[i].target = prog[target].target; } /* Third Pass */ for (i = 0; i < N; i++) { invert = inverts[i] ^ prog[i].when_to_branch; prog[i].when_to_branch = invert; /* Make sure the program always moves forward */ if (WARN_ON(prog[i].target <= i)) { ret = -EINVAL; goto out_free; } } kfree(op_stack); kfree(inverts); return prog; out_free: kfree(op_stack); kfree(inverts); if (prog_stack) { for (i = 0; prog_stack[i].pred; i++) free_predicate(prog_stack[i].pred); kfree(prog_stack); } return ERR_PTR(ret); } static inline int do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp) { switch (op) { case OP_EQ: return cpumask_equal(mask, cmp); case OP_NE: return !cpumask_equal(mask, cmp); case OP_BAND: return cpumask_intersects(mask, cmp); default: return 0; } } /* Optimisation of do_filter_cpumask() for scalar fields */ static inline int do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask) { /* * Per the weight-of-one cpumask optimisations, the mask passed in this * function has a weight >= 2, so it is never equal to a single scalar. */ switch (op) { case OP_EQ: return false; case OP_NE: return true; case OP_BAND: return cpumask_test_cpu(cpu, mask); default: return 0; } } static inline int do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu) { switch (op) { case OP_EQ: return cpumask_test_cpu(cpu, mask) && cpumask_nth(1, mask) >= nr_cpu_ids; case OP_NE: return !cpumask_test_cpu(cpu, mask) || cpumask_nth(1, mask) < nr_cpu_ids; case OP_BAND: return cpumask_test_cpu(cpu, mask); default: return 0; } } enum pred_cmp_types { PRED_CMP_TYPE_NOP, PRED_CMP_TYPE_LT, PRED_CMP_TYPE_LE, PRED_CMP_TYPE_GT, PRED_CMP_TYPE_GE, PRED_CMP_TYPE_BAND, }; #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ switch (pred->op) { \ case OP_LT: { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ return *addr < val; \ } \ case OP_LE: { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ return *addr <= val; \ } \ case OP_GT: { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ return *addr > val; \ } \ case OP_GE: { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ return *addr >= val; \ } \ case OP_BAND: { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ return !!(*addr & val); \ } \ default: \ return 0; \ } \ } #define DEFINE_CPUMASK_COMPARISON_PRED(size) \ static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ unsigned int cpu = *addr; \ \ if (cpu >= nr_cpu_ids) \ return 0; \ \ return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); \ } #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ int match; \ \ match = (val == *addr) ^ pred->not; \ \ return match; \ } DEFINE_COMPARISON_PRED(s64); DEFINE_COMPARISON_PRED(u64); DEFINE_COMPARISON_PRED(s32); DEFINE_COMPARISON_PRED(u32); DEFINE_COMPARISON_PRED(s16); DEFINE_COMPARISON_PRED(u16); DEFINE_COMPARISON_PRED(s8); DEFINE_COMPARISON_PRED(u8); DEFINE_CPUMASK_COMPARISON_PRED(64); DEFINE_CPUMASK_COMPARISON_PRED(32); DEFINE_CPUMASK_COMPARISON_PRED(16); DEFINE_CPUMASK_COMPARISON_PRED(8); DEFINE_EQUALITY_PRED(64); DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); /* user space strings temp buffer */ #define USTRING_BUF_SIZE 1024 struct ustring_buffer { char buffer[USTRING_BUF_SIZE]; }; static __percpu struct ustring_buffer *ustring_per_cpu; static __always_inline char *test_string(char *str) { struct ustring_buffer *ubuf; char *kstr; if (!ustring_per_cpu) return NULL; ubuf = this_cpu_ptr(ustring_per_cpu); kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) return NULL; return kstr; } static __always_inline char *test_ustring(char *str) { struct ustring_buffer *ubuf; char __user *ustr; char *kstr; if (!ustring_per_cpu) return NULL; ubuf = this_cpu_ptr(ustring_per_cpu); kstr = ubuf->buffer; /* user space address? */ ustr = (char __user *)str; if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) return NULL; return kstr; } /* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len); match = cmp ^ pred->not; return match; } static __always_inline int filter_pchar(struct filter_pred *pred, char *str) { int cmp, match; int len; len = strlen(str) + 1; /* including tailing '\0' */ cmp = pred->regex->match(str, pred->regex, len); match = cmp ^ pred->not; return match; } /* Filter predicate for char * pointers */ static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); char *str; str = test_string(*addr); if (!str) return 0; return filter_pchar(pred, str); } /* Filter predicate for char * pointers in user space*/ static int filter_pred_pchar_user(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); char *str; str = test_ustring(*addr); if (!str) return 0; return filter_pchar(pred, str); } /* * Filter predicate for dynamic sized arrays of characters. * These are implemented through a list of strings at the end * of the entry. * Also each of these strings have a field in the entry which * contains its offset from the beginning of the entry. * We have then first to get this field, dereference it * and add it to the address of the entry, and at last we have * the address of the string. */ static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; int str_len = str_item >> 16; char *addr = (char *)(event + str_loc); int cmp, match; cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; return match; } /* * Filter predicate for relative dynamic sized arrays of characters. * These are implemented through a list of strings at the end * of the entry as same as dynamic string. * The difference is that the relative one records the location offset * from the field itself, not the event entry. */ static int filter_pred_strrelloc(struct filter_pred *pred, void *event) { u32 *item = (u32 *)(event + pred->offset); u32 str_item = *item; int str_loc = str_item & 0xffff; int str_len = str_item >> 16; char *addr = (char *)(&item[1]) + str_loc; int cmp, match; cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; return match; } /* Filter predicate for CPUs. */ static int filter_pred_cpu(struct filter_pred *pred, void *event) { int cpu, cmp; cpu = raw_smp_processor_id(); cmp = pred->val; switch (pred->op) { case OP_EQ: return cpu == cmp; case OP_NE: return cpu != cmp; case OP_LT: return cpu < cmp; case OP_LE: return cpu <= cmp; case OP_GT: return cpu > cmp; case OP_GE: return cpu >= cmp; default: return 0; } } /* Filter predicate for current CPU vs user-provided cpumask */ static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event) { int cpu = raw_smp_processor_id(); return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); } /* Filter predicate for cpumask field vs user-provided cpumask */ static int filter_pred_cpumask(struct filter_pred *pred, void *event) { u32 item = *(u32 *)(event + pred->offset); int loc = item & 0xffff; const struct cpumask *mask = (event + loc); const struct cpumask *cmp = pred->mask; return do_filter_cpumask(pred->op, mask, cmp); } /* Filter predicate for cpumask field vs user-provided scalar */ static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event) { u32 item = *(u32 *)(event + pred->offset); int loc = item & 0xffff; const struct cpumask *mask = (event + loc); unsigned int cpu = pred->val; return do_filter_cpumask_scalar(pred->op, mask, cpu); } /* Filter predicate for COMM. */ static int filter_pred_comm(struct filter_pred *pred, void *event) { int cmp; cmp = pred->regex->match(current->comm, pred->regex, TASK_COMM_LEN); return cmp ^ pred->not; } /* Filter predicate for functions. */ static int filter_pred_function(struct filter_pred *pred, void *event) { unsigned long *addr = (unsigned long *)(event + pred->offset); unsigned long start = (unsigned long)pred->val; unsigned long end = (unsigned long)pred->val2; int ret = *addr >= start && *addr < end; return pred->op == OP_EQ ? ret : !ret; } /* * regex_match_foo - Basic regex callbacks * * @str: the string to be searched * @r: the regex structure containing the pattern string * @len: the length of the string to be searched (including '\0') * * Note: * - @str might not be NULL-terminated if it's of type DYN_STRING * RDYN_STRING, or STATIC_STRING, unless @len is zero. */ static int regex_match_full(char *str, struct regex *r, int len) { /* len of zero means str is dynamic and ends with '\0' */ if (!len) return strcmp(str, r->pattern) == 0; return strncmp(str, r->pattern, len) == 0; } static int regex_match_front(char *str, struct regex *r, int len) { if (len && len < r->len) return 0; return strncmp(str, r->pattern, r->len) == 0; } static int regex_match_middle(char *str, struct regex *r, int len) { if (!len) return strstr(str, r->pattern) != NULL; return strnstr(str, r->pattern, len) != NULL; } static int regex_match_end(char *str, struct regex *r, int len) { int strlen = len - 1; if (strlen >= r->len && memcmp(str + strlen - r->len, r->pattern, r->len) == 0) return 1; return 0; } static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) { if (glob_match(r->pattern, str)) return 1; return 0; } /** * filter_parse_regex - parse a basic regex * @buff: the raw regex * @len: length of the regex * @search: will point to the beginning of the string to compare * @not: tell whether the match will have to be inverted * * This passes in a buffer containing a regex and this function will * set search to point to the search part of the buffer and * return the type of search it is (see enum above). * This does modify buff. * * Returns enum type. * search returns the pointer to use for comparison. * not returns 1 if buff started with a '!' * 0 otherwise. */ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) { int type = MATCH_FULL; int i; if (buff[0] == '!') { *not = 1; buff++; len--; } else *not = 0; *search = buff; if (isdigit(buff[0])) return MATCH_INDEX; for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { type = MATCH_END_ONLY; } else if (i == len - 1) { if (type == MATCH_END_ONLY) type = MATCH_MIDDLE_ONLY; else type = MATCH_FRONT_ONLY; buff[i] = 0; break; } else { /* pattern continues, use full glob */ return MATCH_GLOB; } } else if (strchr("[?\\", buff[i])) { return MATCH_GLOB; } } if (buff[0] == '*') *search = buff + 1; return type; } static void filter_build_regex(struct filter_pred *pred) { struct regex *r = pred->regex; char *search; enum regex_type type = MATCH_FULL; if (pred->op == OP_GLOB) { type = filter_parse_regex(r->pattern, r->len, &search, &pred->not); r->len = strlen(search); memmove(r->pattern, search, r->len+1); } switch (type) { /* MATCH_INDEX should not happen, but if it does, match full */ case MATCH_INDEX: case MATCH_FULL: r->match = regex_match_full; break; case MATCH_FRONT_ONLY: r->match = regex_match_front; break; case MATCH_MIDDLE_ONLY: r->match = regex_match_middle; break; case MATCH_END_ONLY: r->match = regex_match_end; break; case MATCH_GLOB: r->match = regex_match_glob; break; } } #ifdef CONFIG_FTRACE_STARTUP_TEST static int test_pred_visited_fn(struct filter_pred *pred, void *event); #else static int test_pred_visited_fn(struct filter_pred *pred, void *event) { return 0; } #endif static int filter_pred_fn_call(struct filter_pred *pred, void *event); /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { struct prog_entry *prog; int i; /* no filter is considered a match */ if (!filter) return 1; /* Protected by either SRCU(tracepoint_srcu) or preempt_disable */ prog = rcu_dereference_raw(filter->prog); if (!prog) return 1; for (i = 0; prog[i].pred; i++) { struct filter_pred *pred = prog[i].pred; int match = filter_pred_fn_call(pred, rec); if (match == prog[i].when_to_branch) i = prog[i].target; } return prog[i].target; } EXPORT_SYMBOL_GPL(filter_match_preds); static void remove_filter_string(struct event_filter *filter) { if (!filter) return; kfree(filter->filter_string); filter->filter_string = NULL; } static void append_filter_err(struct trace_array *tr, struct filter_parse_error *pe, struct event_filter *filter) { struct trace_seq *s; int pos = pe->lasterr_pos; char *buf; int len; if (WARN_ON(!filter->filter_string)) return; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return; trace_seq_init(s); len = strlen(filter->filter_string); if (pos > len) pos = len; /* indexing is off by one */ if (pos) pos++; trace_seq_puts(s, filter->filter_string); if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, FILT_ERR_ERRNO, 0); } trace_seq_putc(s, 0); buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); if (buf) { kfree(filter->filter_string); filter->filter_string = buf; } kfree(s); } static inline struct event_filter *event_filter(struct trace_event_file *file) { return file->filter; } /* caller must hold event_mutex */ void print_event_filter(struct trace_event_file *file, struct trace_seq *s) { struct event_filter *filter = event_filter(file); if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_puts(s, "none\n"); } void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { struct event_filter *filter; mutex_lock(&event_mutex); filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } static void free_prog(struct event_filter *filter) { struct prog_entry *prog; int i; prog = rcu_access_pointer(filter->prog); if (!prog) return; for (i = 0; prog[i].pred; i++) free_predicate(prog[i].pred); kfree(prog); } static void filter_disable(struct trace_event_file *file) { unsigned long old_flags = file->flags; file->flags &= ~EVENT_FILE_FL_FILTERED; if (old_flags != file->flags) trace_buffered_event_disable(); } static void __free_filter(struct event_filter *filter) { if (!filter) return; free_prog(filter); kfree(filter->filter_string); kfree(filter); } void free_event_filter(struct event_filter *filter) { __free_filter(filter); } static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); remove_filter_string(file->filter); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, struct trace_array *tr) { struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; __remove_filter(file); } } static inline void __free_subsystem_filter(struct trace_event_file *file) { __free_filter(file->filter); file->filter = NULL; } static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, struct trace_array *tr) { struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; __free_subsystem_filter(file); } } int filter_assign_type(const char *type) { if (strstr(type, "__data_loc")) { if (strstr(type, "char")) return FILTER_DYN_STRING; if (strstr(type, "cpumask_t")) return FILTER_CPUMASK; } if (strstr(type, "__rel_loc") && strstr(type, "char")) return FILTER_RDYN_STRING; if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; if (strcmp(type, "char *") == 0 || strcmp(type, "const char *") == 0) return FILTER_PTR_STRING; return FILTER_OTHER; } static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op, int field_size, int field_is_signed) { enum filter_pred_fn fn = FILTER_PRED_FN_NOP; int pred_func_index = -1; switch (op) { case OP_EQ: case OP_NE: break; default: if (WARN_ON_ONCE(op < PRED_FUNC_START)) return fn; pred_func_index = op - PRED_FUNC_START; if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) return fn; } switch (field_size) { case 8: if (pred_func_index < 0) fn = FILTER_PRED_FN_64; else if (field_is_signed) fn = FILTER_PRED_FN_S64; else fn = FILTER_PRED_FN_U64; break; case 4: if (pred_func_index < 0) fn = FILTER_PRED_FN_32; else if (field_is_signed) fn = FILTER_PRED_FN_S32; else fn = FILTER_PRED_FN_U32; break; case 2: if (pred_func_index < 0) fn = FILTER_PRED_FN_16; else if (field_is_signed) fn = FILTER_PRED_FN_S16; else fn = FILTER_PRED_FN_U16; break; case 1: if (pred_func_index < 0) fn = FILTER_PRED_FN_8; else if (field_is_signed) fn = FILTER_PRED_FN_S8; else fn = FILTER_PRED_FN_U8; break; } return fn; } static int filter_pred_fn_call(struct filter_pred *pred, void *event) { switch (pred->fn_num) { case FILTER_PRED_FN_64: return filter_pred_64(pred, event); case FILTER_PRED_FN_64_CPUMASK: return filter_pred_64_cpumask(pred, event); case FILTER_PRED_FN_S64: return filter_pred_s64(pred, event); case FILTER_PRED_FN_U64: return filter_pred_u64(pred, event); case FILTER_PRED_FN_32: return filter_pred_32(pred, event); case FILTER_PRED_FN_32_CPUMASK: return filter_pred_32_cpumask(pred, event); case FILTER_PRED_FN_S32: return filter_pred_s32(pred, event); case FILTER_PRED_FN_U32: return filter_pred_u32(pred, event); case FILTER_PRED_FN_16: return filter_pred_16(pred, event); case FILTER_PRED_FN_16_CPUMASK: return filter_pred_16_cpumask(pred, event); case FILTER_PRED_FN_S16: return filter_pred_s16(pred, event); case FILTER_PRED_FN_U16: return filter_pred_u16(pred, event); case FILTER_PRED_FN_8: return filter_pred_8(pred, event); case FILTER_PRED_FN_8_CPUMASK: return filter_pred_8_cpumask(pred, event); case FILTER_PRED_FN_S8: return filter_pred_s8(pred, event); case FILTER_PRED_FN_U8: return filter_pred_u8(pred, event); case FILTER_PRED_FN_COMM: return filter_pred_comm(pred, event); case FILTER_PRED_FN_STRING: return filter_pred_string(pred, event); case FILTER_PRED_FN_STRLOC: return filter_pred_strloc(pred, event); case FILTER_PRED_FN_STRRELLOC: return filter_pred_strrelloc(pred, event); case FILTER_PRED_FN_PCHAR_USER: return filter_pred_pchar_user(pred, event); case FILTER_PRED_FN_PCHAR: return filter_pred_pchar(pred, event); case FILTER_PRED_FN_CPU: return filter_pred_cpu(pred, event); case FILTER_PRED_FN_CPU_CPUMASK: return filter_pred_cpu_cpumask(pred, event); case FILTER_PRED_FN_CPUMASK: return filter_pred_cpumask(pred, event); case FILTER_PRED_FN_CPUMASK_CPU: return filter_pred_cpumask_cpu(pred, event); case FILTER_PRED_FN_FUNCTION: return filter_pred_function(pred, event); case FILTER_PRED_TEST_VISITED: return test_pred_visited_fn(pred, event); default: return 0; } } /* Called when a predicate is encountered by predicate_parse() */ static int parse_pred(const char *str, void *data, int pos, struct filter_parse_error *pe, struct filter_pred **pred_ptr) { struct trace_event_call *call = data; struct ftrace_event_field *field; struct filter_pred *pred = NULL; unsigned long offset; unsigned long size; unsigned long ip; char num_buf[24]; /* Big enough to hold an address */ char *field_name; char *name; bool function = false; bool ustring = false; char q; u64 val; int len; int ret; int op; int s; int i = 0; /* First find the field to associate to */ while (isspace(str[i])) i++; s = i; while (isalnum(str[i]) || str[i] == '_') i++; len = i - s; if (!len) return -1; field_name = kmemdup_nul(str + s, len, GFP_KERNEL); if (!field_name) return -ENOMEM; /* Make sure that the field exists */ field = trace_find_event_field(call, field_name); kfree(field_name); if (!field) { parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i); return -EINVAL; } /* See if the field is a user space string */ if ((len = str_has_prefix(str + i, ".ustring"))) { ustring = true; i += len; } /* See if the field is a kernel function name */ if ((len = str_has_prefix(str + i, ".function"))) { function = true; i += len; } while (isspace(str[i])) i++; /* Make sure this op is supported */ for (op = 0; ops[op]; op++) { /* This is why '<=' must come before '<' in ops[] */ if (strncmp(str + i, ops[op], strlen(ops[op])) == 0) break; } if (!ops[op]) { parse_error(pe, FILT_ERR_INVALID_OP, pos + i); goto err_free; } i += strlen(ops[op]); while (isspace(str[i])) i++; s = i; pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) return -ENOMEM; pred->field = field; pred->offset = field->offset; pred->op = op; if (function) { /* The field must be the same size as long */ if (field->size != sizeof(long)) { parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); goto err_free; } /* Function only works with '==' or '!=' and an unquoted string */ switch (op) { case OP_NE: case OP_EQ: break; default: parse_error(pe, FILT_ERR_INVALID_OP, pos + i); goto err_free; } if (isdigit(str[i])) { /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; len = i - s; /* 0xfeedfacedeadbeef is 18 chars max */ if (len >= sizeof(num_buf)) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); goto err_free; } strncpy(num_buf, str + s, len); num_buf[len] = 0; ret = kstrtoul(num_buf, 0, &ip); if (ret) { parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); goto err_free; } } else { s = i; for (; str[i] && !isspace(str[i]); i++) ; len = i - s; name = kmemdup_nul(str + s, len, GFP_KERNEL); if (!name) goto err_mem; ip = kallsyms_lookup_name(name); kfree(name); if (!ip) { parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); goto err_free; } } /* Now find the function start and end address */ if (!kallsyms_lookup_size_offset(ip, &size, &offset)) { parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); goto err_free; } pred->fn_num = FILTER_PRED_FN_FUNCTION; pred->val = ip - offset; pred->val2 = pred->val + size; } else if (ftrace_event_is_function(call)) { /* * Perf does things different with function events. * It only allows an "ip" field, and expects a string. * But the string does not need to be surrounded by quotes. * If it is a string, the assigned function as a nop, * (perf doesn't use it) and grab everything. */ if (strcmp(field->name, "ip") != 0) { parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); goto err_free; } pred->fn_num = FILTER_PRED_FN_NOP; /* * Quotes are not required, but if they exist then we need * to read them till we hit a matching one. */ if (str[i] == '\'' || str[i] == '"') q = str[i]; else q = 0; for (i++; str[i]; i++) { if (q && str[i] == q) break; if (!q && (str[i] == ')' || str[i] == '&' || str[i] == '|')) break; } /* Skip quotes */ if (q) s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); goto err_free; } pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); if (!pred->regex) goto err_mem; pred->regex->len = len; strncpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; } else if (!strncmp(str + i, "CPUS", 4)) { unsigned int maskstart; bool single; char *tmp; switch (field->filter_type) { case FILTER_CPUMASK: case FILTER_CPU: case FILTER_OTHER: break; default: parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); goto err_free; } switch (op) { case OP_EQ: case OP_NE: case OP_BAND: break; default: parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); goto err_free; } /* Skip CPUS */ i += 4; if (str[i++] != '{') { parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i); goto err_free; } maskstart = i; /* Walk the cpulist until closing } */ for (; str[i] && str[i] != '}'; i++) ; if (str[i] != '}') { parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i); goto err_free; } if (maskstart == i) { parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); goto err_free; } /* Copy the cpulist between { and } */ tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL); if (!tmp) goto err_mem; strscpy(tmp, str + maskstart, (i - maskstart) + 1); pred->mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!pred->mask) { kfree(tmp); goto err_mem; } /* Now parse it */ if (cpulist_parse(tmp, pred->mask)) { kfree(tmp); parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); goto err_free; } kfree(tmp); /* Move along */ i++; /* * Optimisation: if the user-provided mask has a weight of one * then we can treat it as a scalar input. */ single = cpumask_weight(pred->mask) == 1; if (single) { pred->val = cpumask_first(pred->mask); kfree(pred->mask); pred->mask = NULL; } if (field->filter_type == FILTER_CPUMASK) { pred->fn_num = single ? FILTER_PRED_FN_CPUMASK_CPU : FILTER_PRED_FN_CPUMASK; } else if (field->filter_type == FILTER_CPU) { if (single) { if (pred->op == OP_BAND) pred->op = OP_EQ; pred->fn_num = FILTER_PRED_FN_CPU; } else { pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK; } } else if (single) { if (pred->op == OP_BAND) pred->op = OP_EQ; pred->fn_num = select_comparison_fn(pred->op, field->size, false); if (pred->op == OP_NE) pred->not = 1; } else { switch (field->size) { case 8: pred->fn_num = FILTER_PRED_FN_64_CPUMASK; break; case 4: pred->fn_num = FILTER_PRED_FN_32_CPUMASK; break; case 2: pred->fn_num = FILTER_PRED_FN_16_CPUMASK; break; case 1: pred->fn_num = FILTER_PRED_FN_8_CPUMASK; break; } } /* This is either a string, or an integer */ } else if (str[i] == '\'' || str[i] == '"') { char q = str[i]; /* Make sure the op is OK for strings */ switch (op) { case OP_NE: pred->not = 1; fallthrough; case OP_GLOB: case OP_EQ: break; default: parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); goto err_free; } /* Make sure the field is OK for strings */ if (!is_string_field(field)) { parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i); goto err_free; } for (i++; str[i]; i++) { if (str[i] == q) break; } if (!str[i]) { parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i); goto err_free; } /* Skip quotes */ s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); goto err_free; } pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); if (!pred->regex) goto err_mem; pred->regex->len = len; strncpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; filter_build_regex(pred); if (field->filter_type == FILTER_COMM) { pred->fn_num = FILTER_PRED_FN_COMM; } else if (field->filter_type == FILTER_STATIC_STRING) { pred->fn_num = FILTER_PRED_FN_STRING; pred->regex->field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn_num = FILTER_PRED_FN_STRLOC; } else if (field->filter_type == FILTER_RDYN_STRING) pred->fn_num = FILTER_PRED_FN_STRRELLOC; else { if (!ustring_per_cpu) { /* Once allocated, keep it around for good */ ustring_per_cpu = alloc_percpu(struct ustring_buffer); if (!ustring_per_cpu) goto err_mem; } if (ustring) pred->fn_num = FILTER_PRED_FN_PCHAR_USER; else pred->fn_num = FILTER_PRED_FN_PCHAR; } /* go past the last quote */ i++; } else if (isdigit(str[i]) || str[i] == '-') { /* Make sure the field is not a string */ if (is_string_field(field)) { parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i); goto err_free; } if (op == OP_GLOB) { parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); goto err_free; } if (str[i] == '-') i++; /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; len = i - s; /* 0xfeedfacedeadbeef is 18 chars max */ if (len >= sizeof(num_buf)) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); goto err_free; } strncpy(num_buf, str + s, len); num_buf[len] = 0; /* Make sure it is a value */ if (field->is_signed) ret = kstrtoll(num_buf, 0, &val); else ret = kstrtoull(num_buf, 0, &val); if (ret) { parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s); goto err_free; } pred->val = val; if (field->filter_type == FILTER_CPU) pred->fn_num = FILTER_PRED_FN_CPU; else { pred->fn_num = select_comparison_fn(pred->op, field->size, field->is_signed); if (pred->op == OP_NE) pred->not = 1; } } else { parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); goto err_free; } *pred_ptr = pred; return i; err_free: free_predicate(pred); return -EINVAL; err_mem: free_predicate(pred); return -ENOMEM; } enum { TOO_MANY_CLOSE = -1, TOO_MANY_OPEN = -2, MISSING_QUOTE = -3, }; /* * Read the filter string once to calculate the number of predicates * as well as how deep the parentheses go. * * Returns: * 0 - everything is fine (err is undefined) * -1 - too many ')' * -2 - too many '(' * -3 - No matching quote */ static int calc_stack(const char *str, int *parens, int *preds, int *err) { bool is_pred = false; int nr_preds = 0; int open = 1; /* Count the expression as "(E)" */ int last_quote = 0; int max_open = 1; int quote = 0; int i; *err = 0; for (i = 0; str[i]; i++) { if (isspace(str[i])) continue; if (quote) { if (str[i] == quote) quote = 0; continue; } switch (str[i]) { case '\'': case '"': quote = str[i]; last_quote = i; break; case '|': case '&': if (str[i+1] != str[i]) break; is_pred = false; continue; case '(': is_pred = false; open++; if (open > max_open) max_open = open; continue; case ')': is_pred = false; if (open == 1) { *err = i; return TOO_MANY_CLOSE; } open--; continue; } if (!is_pred) { nr_preds++; is_pred = true; } } if (quote) { *err = last_quote; return MISSING_QUOTE; } if (open != 1) { int level = open; /* find the bad open */ for (i--; i; i--) { if (quote) { if (str[i] == quote) quote = 0; continue; } switch (str[i]) { case '(': if (level == open) { *err = i; return TOO_MANY_OPEN; } level--; break; case ')': level++; break; case '\'': case '"': quote = str[i]; break; } } /* First character is the '(' with missing ')' */ *err = 0; return TOO_MANY_OPEN; } /* Set the size of the required stacks */ *parens = max_open; *preds = nr_preds; return 0; } static int process_preds(struct trace_event_call *call, const char *filter_string, struct event_filter *filter, struct filter_parse_error *pe) { struct prog_entry *prog; int nr_parens; int nr_preds; int index; int ret; ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index); if (ret < 0) { switch (ret) { case MISSING_QUOTE: parse_error(pe, FILT_ERR_MISSING_QUOTE, index); break; case TOO_MANY_OPEN: parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index); break; default: parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index); } return ret; } if (!nr_preds) return -EINVAL; prog = predicate_parse(filter_string, nr_parens, nr_preds, parse_pred, call, pe); if (IS_ERR(prog)) return PTR_ERR(prog); rcu_assign_pointer(filter->prog, prog); return 0; } static inline void event_set_filtered_flag(struct trace_event_file *file) { unsigned long old_flags = file->flags; file->flags |= EVENT_FILE_FL_FILTERED; if (old_flags != file->flags) trace_buffered_event_enable(); } static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { rcu_assign_pointer(file->filter, filter); } static inline void event_clear_filter(struct trace_event_file *file) { RCU_INIT_POINTER(file->filter, NULL); } struct filter_list { struct list_head list; struct event_filter *filter; }; static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, char *filter_string) { struct trace_event_file *file; struct filter_list *filter_item; struct event_filter *filter = NULL; struct filter_list *tmp; LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter) goto fail_mem; filter->filter_string = kstrdup(filter_string, GFP_KERNEL); if (!filter->filter_string) goto fail_mem; err = process_preds(file->event_call, filter_string, filter, pe); if (err) { filter_disable(file); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); append_filter_err(tr, pe, filter); } else event_set_filtered_flag(file); filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; list_add_tail(&filter_item->list, &filter_list); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ filter_item->filter = event_filter(file); event_set_filter(file, filter); filter = NULL; fail = false; } if (fail) goto fail; /* * The calls can still be using the old filters. * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); kfree(filter_item); } return 0; fail: /* No call succeeded */ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { list_del(&filter_item->list); kfree(filter_item); } parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); /* If any call succeeded, we still need to sync */ if (!fail) tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); kfree(filter_item); } return -ENOMEM; } static int create_filter_start(char *filter_string, bool set_str, struct filter_parse_error **pse, struct event_filter **filterp) { struct event_filter *filter; struct filter_parse_error *pe = NULL; int err = 0; if (WARN_ON_ONCE(*pse || *filterp)) return -EINVAL; filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (filter && set_str) { filter->filter_string = kstrdup(filter_string, GFP_KERNEL); if (!filter->filter_string) err = -ENOMEM; } pe = kzalloc(sizeof(*pe), GFP_KERNEL); if (!filter || !pe || err) { kfree(pe); __free_filter(filter); return -ENOMEM; } /* we're committed to creating a new filter */ *filterp = filter; *pse = pe; return 0; } static void create_filter_finish(struct filter_parse_error *pe) { kfree(pe); } /** * create_filter - create a filter for a trace_event_call * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for * @filter_string: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) * Must be a pointer that references a NULL pointer. * * Creates a filter for @call with @filter_str. If @set_str is %true, * @filter_str is copied and recorded in the new filter. * * On success, returns 0 and *@filterp points to the new filter. On * failure, returns -errno and *@filterp may point to %NULL or to a new * filter. In the latter case, the returned filter contains error * information if @set_str is %true and the caller is responsible for * freeing it. */ static int create_filter(struct trace_array *tr, struct trace_event_call *call, char *filter_string, bool set_str, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; int err; /* filterp must point to NULL */ if (WARN_ON(*filterp)) *filterp = NULL; err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) append_filter_err(tr, pe, *filterp); create_filter_finish(pe); return err; } int create_event_filter(struct trace_array *tr, struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { return create_filter(tr, call, filter_str, set_str, filterp); } /** * create_system_filter - create a filter for an event subsystem * @dir: the descriptor for the subsystem directory * @filter_str: filter string * @filterp: out param for created filter (always updated on return) * * Identical to create_filter() except that it creates a subsystem filter * and always remembers @filter_str. */ static int create_system_filter(struct trace_subsystem_dir *dir, char *filter_str, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; int err; err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { err = process_system_preds(dir, dir->tr, pe, filter_str); if (!err) { /* System filters just show a default message */ kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { append_filter_err(dir->tr, pe, *filterp); } } create_filter_finish(pe); return err; } /* caller must hold event_mutex */ int apply_event_filter(struct trace_event_file *file, char *filter_string) { struct trace_event_call *call = file->event_call; struct event_filter *filter = NULL; int err; if (file->flags & EVENT_FILE_FL_FREED) return -ENODEV; if (!strcmp(strstrip(filter_string), "0")) { filter_disable(file); filter = event_filter(file); if (!filter) return 0; event_clear_filter(file); /* Make sure the filter is not being used */ tracepoint_synchronize_unregister(); __free_filter(filter); return 0; } err = create_filter(file->tr, call, filter_string, true, &filter); /* * Always swap the call filter with the new filter * even if there was an error. If there was an error * in the filter, we disable the filter and show the error * string */ if (filter) { struct event_filter *tmp; tmp = event_filter(file); if (!err) event_set_filtered_flag(file); else filter_disable(file); event_set_filter(file, filter); if (tmp) { /* Make sure the call is done with the filter */ tracepoint_synchronize_unregister(); __free_filter(tmp); } } return err; } int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, char *filter_string) { struct event_subsystem *system = dir->subsystem; struct trace_array *tr = dir->tr; struct event_filter *filter = NULL; int err = 0; mutex_lock(&event_mutex); /* Make sure the system still has events */ if (!dir->nr_events) { err = -ENODEV; goto out_unlock; } if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(dir, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ tracepoint_synchronize_unregister(); filter_free_subsystem_filters(dir, tr); __free_filter(filter); goto out_unlock; } err = create_system_filter(dir, filter_string, &filter); if (filter) { /* * No event actually uses the system filter * we can free it without synchronize_rcu(). */ __free_filter(system->filter); system->filter = filter; } out_unlock: mutex_unlock(&event_mutex); return err; } #ifdef CONFIG_PERF_EVENTS void ftrace_profile_free_filter(struct perf_event *event) { struct event_filter *filter = event->filter; event->filter = NULL; __free_filter(filter); } struct function_filter_data { struct ftrace_ops *ops; int first_filter; int first_notrace; }; #ifdef CONFIG_FUNCTION_TRACER static char ** ftrace_function_filter_re(char *buf, int len, int *count) { char *str, **re; str = kstrndup(buf, len, GFP_KERNEL); if (!str) return NULL; /* * The argv_split function takes white space * as a separator, so convert ',' into spaces. */ strreplace(str, ',', ' '); re = argv_split(GFP_KERNEL, str, count); kfree(str); return re; } static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, int reset, char *re, int len) { int ret; if (filter) ret = ftrace_set_filter(ops, re, len, reset); else ret = ftrace_set_notrace(ops, re, len, reset); return ret; } static int __ftrace_function_set_filter(int filter, char *buf, int len, struct function_filter_data *data) { int i, re_cnt, ret = -EINVAL; int *reset; char **re; reset = filter ? &data->first_filter : &data->first_notrace; /* * The 'ip' field could have multiple filters set, separated * either by space or comma. We first cut the filter and apply * all pieces separately. */ re = ftrace_function_filter_re(buf, len, &re_cnt); if (!re) return -EINVAL; for (i = 0; i < re_cnt; i++) { ret = ftrace_function_set_regexp(data->ops, filter, *reset, re[i], strlen(re[i])); if (ret) break; if (*reset) *reset = 0; } argv_free(re); return ret; } static int ftrace_function_check_pred(struct filter_pred *pred) { struct ftrace_event_field *field = pred->field; /* * Check the predicate for function trace, verify: * - only '==' and '!=' is used * - the 'ip' field is used */ if ((pred->op != OP_EQ) && (pred->op != OP_NE)) return -EINVAL; if (strcmp(field->name, "ip")) return -EINVAL; return 0; } static int ftrace_function_set_filter_pred(struct filter_pred *pred, struct function_filter_data *data) { int ret; /* Checking the node is valid for function trace. */ ret = ftrace_function_check_pred(pred); if (ret) return ret; return __ftrace_function_set_filter(pred->op == OP_EQ, pred->regex->pattern, pred->regex->len, data); } static bool is_or(struct prog_entry *prog, int i) { int target; /* * Only "||" is allowed for function events, thus, * all true branches should jump to true, and any * false branch should jump to false. */ target = prog[i].target + 1; /* True and false have NULL preds (all prog entries should jump to one */ if (prog[target].pred) return false; /* prog[target].target is 1 for TRUE, 0 for FALSE */ return prog[i].when_to_branch == prog[target].target; } static int ftrace_function_set_filter(struct perf_event *event, struct event_filter *filter) { struct prog_entry *prog = rcu_dereference_protected(filter->prog, lockdep_is_held(&event_mutex)); struct function_filter_data data = { .first_filter = 1, .first_notrace = 1, .ops = &event->ftrace_ops, }; int i; for (i = 0; prog[i].pred; i++) { struct filter_pred *pred = prog[i].pred; if (!is_or(prog, i)) return -EINVAL; if (ftrace_function_set_filter_pred(pred, &data) < 0) return -EINVAL; } return 0; } #else static int ftrace_function_set_filter(struct perf_event *event, struct event_filter *filter) { return -ENODEV; } #endif /* CONFIG_FUNCTION_TRACER */ int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { int err; struct event_filter *filter = NULL; struct trace_event_call *call; mutex_lock(&event_mutex); call = event->tp_event; err = -EINVAL; if (!call) goto out_unlock; err = -EEXIST; if (event->filter) goto out_unlock; err = create_filter(NULL, call, filter_str, false, &filter); if (err) goto free_filter; if (ftrace_event_is_function(call)) err = ftrace_function_set_filter(event, filter); else event->filter = filter; free_filter: if (err || ftrace_event_is_function(call)) __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); return err; } #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_FTRACE_STARTUP_TEST #include <linux/types.h> #include <linux/tracepoint.h> #define CREATE_TRACE_POINTS #include "trace_events_filter_test.h" #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ { \ .filter = FILTER, \ .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ .e = ve, .f = vf, .g = vg, .h = vh }, \ .match = m, \ .not_visited = nvisit, \ } #define YES 1 #define NO 0 static struct test_filter_data_t { char *filter; struct trace_event_raw_ftrace_test_filter rec; int match; char *not_visited; } test_filter_data[] = { #define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ "e == 1 && f == 1 && g == 1 && h == 1" DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), #undef FILTER #define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ "e == 1 || f == 1 || g == 1 || h == 1" DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), #undef FILTER #define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ "(e == 1 || f == 1) && (g == 1 || h == 1)" DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), #undef FILTER #define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ "(e == 1 && f == 1) || (g == 1 && h == 1)" DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), #undef FILTER #define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ "(e == 1 && f == 1) || (g == 1 && h == 1)" DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), #undef FILTER #define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ "(e == 1 || f == 1)) && (g == 1 || h == 1)" DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), #undef FILTER #define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), #undef FILTER #define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), }; #undef DATA_REC #undef FILTER #undef YES #undef NO #define DATA_CNT ARRAY_SIZE(test_filter_data) static int test_pred_visited; static int test_pred_visited_fn(struct filter_pred *pred, void *event) { struct ftrace_event_field *field = pred->field; test_pred_visited = 1; printk(KERN_INFO "\npred visited %s\n", field->name); return 1; } static void update_pred_fn(struct event_filter *filter, char *fields) { struct prog_entry *prog = rcu_dereference_protected(filter->prog, lockdep_is_held(&event_mutex)); int i; for (i = 0; prog[i].pred; i++) { struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP); if (!field) { WARN_ONCE(1, "all leafs should have field defined %d", i); continue; } if (!strchr(fields, *field->name)) continue; pred->fn_num = FILTER_PRED_TEST_VISITED; } } static __init int ftrace_test_event_filter(void) { int i; printk(KERN_INFO "Testing ftrace filter: "); for (i = 0; i < DATA_CNT; i++) { struct event_filter *filter = NULL; struct test_filter_data_t *d = &test_filter_data[i]; int err; err = create_filter(NULL, &event_ftrace_test_filter, d->filter, false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", d->filter, err); __free_filter(filter); break; } /* Needed to dereference filter->prog */ mutex_lock(&event_mutex); /* * The preemption disabling is not really needed for self * tests, but the rcu dereference will complain without it. */ preempt_disable(); if (*d->not_visited) update_pred_fn(filter, d->not_visited); test_pred_visited = 0; err = filter_match_preds(filter, &d->rec); preempt_enable(); mutex_unlock(&event_mutex); __free_filter(filter); if (test_pred_visited) { printk(KERN_INFO "Failed, unwanted pred visited for filter %s\n", d->filter); break; } if (err != d->match) { printk(KERN_INFO "Failed to match filter '%s', expected %d\n", d->filter, d->match); break; } } if (i == DATA_CNT) printk(KERN_CONT "OK\n"); return 0; } late_initcall(ftrace_test_event_filter); #endif /* CONFIG_FTRACE_STARTUP_TEST */
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM exceptions #if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> #include <asm/trace/common.h> extern int trace_pagefault_reg(void); extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code), TP_STRUCT__entry( __field( unsigned long, address ) __field( unsigned long, ip ) __field( unsigned long, error_code ) ), TP_fast_assign( __entry->address = address; __entry->ip = regs->ip; __entry->error_code = error_code; ), TP_printk("address=%ps ip=%ps error_code=0x%lx", (void *)__entry->address, (void *)__entry->ip, __entry->error_code) ); #define DEFINE_PAGE_FAULT_EVENT(name) \ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE exceptions #endif /* _TRACE_PAGE_FAULT_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
13 2 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 // SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/poll.h> #include <linux/security.h> #include "internal.h" struct poll_table_struct; static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static ssize_t default_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return count; } const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; #define F_DENTRY(filp) ((filp)->f_path.dentry) const struct file_operations *debugfs_real_fops(const struct file *filp) { struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) { /* * Urgh, we've been called w/o a protecting * debugfs_file_get(). */ WARN_ON(1); return NULL; } return fsd->real_fops; } EXPORT_SYMBOL_GPL(debugfs_real_fops); /** * debugfs_file_get - mark the beginning of file data access * @dentry: the dentry object whose data is being accessed. * * Up to a matching call to debugfs_file_put(), any successive call * into the file removing functions debugfs_remove() and * debugfs_remove_recursive() will block. Since associated private * file data may only get freed after a successful return of any of * the removal functions, you may safely access it after a successful * call to debugfs_file_get() without worrying about lifetime issues. * * If -%EIO is returned, the file has already been removed and thus, * it is not safe to access any of its data. If, on the other hand, * it is allowed to access the file data, zero is returned. */ int debugfs_file_get(struct dentry *dentry) { struct debugfs_fsdata *fsd; void *d_fsd; /* * This could only happen if some debugfs user erroneously calls * debugfs_file_get() on a dentry that isn't even a file, let * them know about it. */ if (WARN_ON(!d_is_reg(dentry))) return -EINVAL; d_fsd = READ_ONCE(dentry->d_fsdata); if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; } else { fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; fsd->real_fops = (void *)((unsigned long)d_fsd & ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); INIT_LIST_HEAD(&fsd->cancellations); mutex_init(&fsd->cancellations_mtx); if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) { mutex_destroy(&fsd->cancellations_mtx); kfree(fsd); fsd = READ_ONCE(dentry->d_fsdata); } } /* * In case of a successful cmpxchg() above, this check is * strictly necessary and must follow it, see the comment in * __debugfs_remove_file(). * OTOH, if the cmpxchg() hasn't been executed or wasn't * successful, this serves the purpose of not starving * removers. */ if (d_unlinked(dentry)) return -EIO; if (!refcount_inc_not_zero(&fsd->active_users)) return -EIO; return 0; } EXPORT_SYMBOL_GPL(debugfs_file_get); /** * debugfs_file_put - mark the end of file data access * @dentry: the dentry object formerly passed to * debugfs_file_get(). * * Allow any ongoing concurrent call into debugfs_remove() or * debugfs_remove_recursive() blocked by a former call to * debugfs_file_get() to proceed and return to its caller. */ void debugfs_file_put(struct dentry *dentry) { struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata); if (refcount_dec_and_test(&fsd->active_users)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_file_put); /** * debugfs_enter_cancellation - enter a debugfs cancellation * @file: the file being accessed * @cancellation: the cancellation object, the cancel callback * inside of it must be initialized * * When a debugfs file is removed it needs to wait for all active * operations to complete. However, the operation itself may need * to wait for hardware or completion of some asynchronous process * or similar. As such, it may need to be cancelled to avoid long * waits or even deadlocks. * * This function can be used inside a debugfs handler that may * need to be cancelled. As soon as this function is called, the * cancellation's 'cancel' callback may be called, at which point * the caller should proceed to call debugfs_leave_cancellation() * and leave the debugfs handler function as soon as possible. * Note that the 'cancel' callback is only ever called in the * context of some kind of debugfs_remove(). * * This function must be paired with debugfs_leave_cancellation(). */ void debugfs_enter_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); INIT_LIST_HEAD(&cancellation->list); if (WARN_ON(!d_is_reg(dentry))) return; if (WARN_ON(!cancellation->cancel)) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd || ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) return; mutex_lock(&fsd->cancellations_mtx); list_add(&cancellation->list, &fsd->cancellations); mutex_unlock(&fsd->cancellations_mtx); /* if we're already removing wake it up to cancel */ if (d_unlinked(dentry)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_enter_cancellation); /** * debugfs_leave_cancellation - leave cancellation section * @file: the file being accessed * @cancellation: the cancellation previously registered with * debugfs_enter_cancellation() * * See the documentation of debugfs_enter_cancellation(). */ void debugfs_leave_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); if (WARN_ON(!d_is_reg(dentry))) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd || ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) return; mutex_lock(&fsd->cancellations_mtx); if (!list_empty(&cancellation->list)) list_del(&cancellation->list); mutex_unlock(&fsd->cancellations_mtx); } EXPORT_SYMBOL_GPL(debugfs_leave_cancellation); /* * Only permit access to world-readable files when the kernel is locked down. * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap) return 0; if (security_locked_down(LOCKDOWN_DEBUGFS)) return -EPERM; return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; int r; r = debugfs_file_get(dentry); if (r) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not clean up after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } replace_fops(filp, real_fops); if (real_fops->open) r = real_fops->open(inode, filp); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_open_proxy_file_operations = { .open = open_proxy_open, }; #define PROTO(args...) args #define ARGS(args...) args #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops; \ ret_type r; \ \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ real_fops = debugfs_real_fops(filp); \ r = real_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } FULL_PROXY_FUNC(llseek, loff_t, filp, PROTO(struct file *filp, loff_t offset, int whence), ARGS(filp, offset, whence)); FULL_PROXY_FUNC(read, ssize_t, filp, PROTO(struct file *filp, char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(write, ssize_t, filp, PROTO(struct file *filp, const char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), ARGS(filp, cmd, arg)); static __poll_t full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { struct dentry *dentry = F_DENTRY(filp); __poll_t r = 0; const struct file_operations *real_fops; if (debugfs_file_get(dentry)) return EPOLLHUP; real_fops = debugfs_real_fops(filp); r = real_fops->poll(filp, wait); debugfs_file_put(dentry); return r; } static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; /* * We must not protect this against removal races here: the * original releaser should be called unconditionally in order * not to leak any resources. Releasers must not assume that * ->i_private is still being meaningful here. */ if (real_fops->release) r = real_fops->release(inode, filp); replace_fops(filp, d_inode(dentry)->i_fop); kfree(proxy_fops); fops_put(real_fops); return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, const struct file_operations *real_fops) { proxy_fops->release = full_proxy_release; if (real_fops->llseek) proxy_fops->llseek = full_proxy_llseek; if (real_fops->read) proxy_fops->read = full_proxy_read; if (real_fops->write) proxy_fops->write = full_proxy_write; if (real_fops->poll) proxy_fops->poll = full_proxy_poll; if (real_fops->unlocked_ioctl) proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; } static int full_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; struct file_operations *proxy_fops = NULL; int r; r = debugfs_file_get(dentry); if (r) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not cleanup after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); if (!proxy_fops) { r = -ENOMEM; goto free_proxy; } __full_proxy_fops_init(proxy_fops, real_fops); replace_fops(filp, proxy_fops); if (real_fops->open) { r = real_fops->open(inode, filp); if (r) { replace_fops(filp, d_inode(dentry)->i_fop); goto free_proxy; } else if (filp->f_op != proxy_fops) { /* No protection against file removal anymore. */ WARN(1, "debugfs file owner replaced proxy fops: %pd", dentry); goto free_proxy; } } goto out; free_proxy: kfree(proxy_fops); fops_put(real_fops); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; ret = simple_attr_read(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_read); static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; if (is_signed) ret = simple_attr_write_signed(file, buf, len, ppos); else ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(debugfs_attr_write); ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, const struct file_operations *fops_ro, const struct file_operations *fops_wo) { /* if there are no write bits set, make read only */ if (!(mode & S_IWUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_ro); /* if there are no read bits set, make write only */ if (!(mode & S_IRUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_wo); return debugfs_create_file_unsafe(name, mode, parent, value, fops); } static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; return 0; } static int debugfs_u8_get(void *data, u64 *val) { *val = *(u8 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); static int debugfs_u16_set(void *data, u64 val) { *(u16 *)data = val; return 0; } static int debugfs_u16_get(void *data, u64 *val) { *val = *(u16 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); static int debugfs_u32_set(void *data, u64 val) { *(u32 *)data = val; return 0; } static int debugfs_u32_get(void *data, u64 *val) { *val = *(u32 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); static int debugfs_u64_set(void *data, u64 val) { *(u64 *)data = val; return 0; } static int debugfs_u64_get(void *data, u64 *val) { *val = *(u64 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); static int debugfs_ulong_set(void *data, u64 val) { *(unsigned long *)data = val; return 0; } static int debugfs_ulong_get(void *data, u64 *val) { *val = *(unsigned long *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); /** * debugfs_create_ulong - create a debugfs file that is used to read and write * an unsigned long value. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong, &fops_ulong_ro, &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value * * These functions are exactly the same as the above functions (but use a hex * output for the decimal challenged). For details look at the above unsigned * decimal functions. */ /** * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); /** * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); /** * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); /** * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); static int debugfs_size_t_set(void *data, u64 val) { *(size_t *)data = val; return 0; } static int debugfs_size_t_get(void *data, u64 *val) { *val = *(size_t *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t, &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); static int debugfs_atomic_t_set(void *data, u64 val) { atomic_set((atomic_t *)data, val); return 0; } static int debugfs_atomic_t_get(void *data, u64 *val) { *val = atomic_read((atomic_t *)data); return 0; } DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** * debugfs_create_atomic_t - create a debugfs file that is used to read and * write an atomic_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t, &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); r = debugfs_file_get(dentry); if (unlikely(r)) return r; val = *(bool *)file->private_data; debugfs_file_put(dentry); if (val) buf[0] = 'Y'; else buf[0] = 'N'; buf[1] = '\n'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool bv; int r; bool *val = file->private_data; struct dentry *dentry = F_DENTRY(file); r = kstrtobool_from_user(user_buf, count, &bv); if (!r) { r = debugfs_file_get(dentry); if (unlikely(r)) return r; *val = bv; debugfs_file_put(dentry); } return count; } EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { .read = debugfs_read_file_bool, .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_ro = { .read = debugfs_read_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_wo = { .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *str, *copy = NULL; int copy_len, len; ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; str = *(char **)file->private_data; len = strlen(str) + 1; copy = kmalloc(len, GFP_KERNEL); if (!copy) { debugfs_file_put(dentry); return -ENOMEM; } copy_len = strscpy(copy, str, len); debugfs_file_put(dentry); if (copy_len < 0) { kfree(copy); return copy_len; } copy[copy_len] = '\n'; ret = simple_read_from_buffer(user_buf, count, ppos, copy, len); kfree(copy); return ret; } EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *old, *new = NULL; int pos = *ppos; int r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; old = *(char **)file->private_data; /* only allow strict concatenation */ r = -EINVAL; if (pos && pos != strlen(old)) goto error; r = -E2BIG; if (pos + count + 1 > PAGE_SIZE) goto error; r = -ENOMEM; new = kmalloc(pos + count + 1, GFP_KERNEL); if (!new) goto error; if (pos) memcpy(new, old, pos); r = -EFAULT; if (copy_from_user(new + pos, user_buf, count)) goto error; new[pos + count] = '\0'; strim(new); rcu_assign_pointer(*(char __rcu **)file->private_data, new); synchronize_rcu(); kfree(old); debugfs_file_put(dentry); return count; error: kfree(new); debugfs_file_put(dentry); return r; } static const struct file_operations fops_str = { .read = debugfs_read_file_str, .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_ro = { .read = debugfs_read_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_wo = { .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_str - create a debugfs file that is used to read and write a string value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str, &fops_str_ro, &fops_str_wo); } static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_read_from_buffer(user_buf, count, ppos, blob->data, blob->size); debugfs_file_put(dentry); return r; } static ssize_t write_file_blob(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf, count); debugfs_file_put(dentry); return r; } static const struct file_operations fops_blob = { .read = read_file_blob, .write = write_file_blob, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_blob - create a debugfs file that is used to read and write * a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer * to the blob data and the size of the data. * * This function creates a file in debugfs with the given name that exports * @blob->data as a binary blob. If the @mode variable is so set it can be * read from and written to. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); static size_t u32_format_array(char *buf, size_t bufsize, u32 *array, int array_size) { size_t ret = 0; while (--array_size >= 0) { size_t len; char term = array_size ? ' ' : '\n'; len = snprintf(buf, bufsize, "%u%c", *array++, term); ret += len; buf += len; bufsize -= len; } return ret; } static int u32_array_open(struct inode *inode, struct file *file) { struct debugfs_u32_array *data = inode->i_private; int size, elements = data->n_elements; char *buf; /* * Max size: * - 10 digits + ' '/'\n' = 11 bytes per number * - terminating NUL character */ size = elements*11; buf = kmalloc(size+1, GFP_KERNEL); if (!buf) return -ENOMEM; buf[size] = 0; file->private_data = buf; u32_format_array(buf, size, data->array, data->n_elements); return nonseekable_open(inode, file); } static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { size_t size = strlen(file->private_data); return simple_read_from_buffer(buf, len, ppos, file->private_data, size); } static int u32_array_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release = u32_array_release, .read = u32_array_read, .llseek = no_llseek, }; /** * debugfs_create_u32_array - create a debugfs file that is used to read u32 * array. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @array: wrapper struct containing data pointer and size of the array. * * This function creates a file in debugfs with the given name that exports * @array as data. If the @mode variable is so set it can be read from. * Writing is not supported. Seek within the file is also not supported. * Once array is created its size can not be changed. */ void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, struct debugfs_u32_array *array) { debugfs_create_file_unsafe(name, mode, parent, array, &u32_array_fops); } EXPORT_SYMBOL_GPL(debugfs_create_u32_array); #ifdef CONFIG_HAS_IOMEM /* * The regset32 stuff is used to print 32-bit registers using the * seq_file utilities. We offer printing a register set in an already-opened * sequential file or create a debugfs file that only prints a regset32. */ /** * debugfs_print_regs32 - use seq_print to describe a set of registers * @s: the seq_file structure being used to generate output * @regs: an array if struct debugfs_reg32 structures * @nregs: the length of the above array * @base: the base address to be used in reading the registers * @prefix: a string to be prefixed to every output line * * This function outputs a text block describing the current values of * some 32-bit hardware registers. It is meant to be used within debugfs * files based on seq_file that need to show registers, intermixed with other * information. The prefix argument may be used to specify a leading string, * because some peripherals have several blocks of identical registers, * for example configuration of dma channels */ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, int nregs, void __iomem *base, char *prefix) { int i; for (i = 0; i < nregs; i++, regs++) { if (prefix) seq_printf(s, "%s", prefix); seq_printf(s, "%s = 0x%08x\n", regs->name, readl(base + regs->offset)); if (seq_has_overflowed(s)) break; } } EXPORT_SYMBOL_GPL(debugfs_print_regs32); static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; if (regset->dev) pm_runtime_get_sync(regset->dev); debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); if (regset->dev) pm_runtime_put(regset->dev); return 0; } DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @regset: a pointer to a struct debugfs_regset32, which contains a pointer * to an array of register definitions, the array size and the base * address where the register bank is to be found. * * This function creates a file in debugfs with the given name that reports * the names and values of a set of 32-bit registers. If the @mode variable * is so set it can be read from. Writing is not supported. */ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); #endif /* CONFIG_HAS_IOMEM */ struct debugfs_devm_entry { int (*read)(struct seq_file *seq, void *data); struct device *dev; }; static int debugfs_devm_entry_open(struct inode *inode, struct file *f) { struct debugfs_devm_entry *entry = inode->i_private; return single_open(f, entry->read, entry->dev); } static const struct file_operations debugfs_devm_entry_ops = { .owner = THIS_MODULE, .open = debugfs_devm_entry_open, .release = single_release, .read = seq_read, .llseek = seq_lseek }; /** * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. * * @dev: device related to this debugfs file. * @name: name of the debugfs file. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @read_fn: function pointer called to print the seq_file content. */ void debugfs_create_devm_seqfile(struct device *dev, const char *name, struct dentry *parent, int (*read_fn)(struct seq_file *s, void *data)) { struct debugfs_devm_entry *entry; if (IS_ERR(parent)) return; entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); if (!entry) return; entry->read = read_fn; entry->dev = dev; debugfs_create_file(name, S_IRUGO, parent, entry, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);
4 4 4 4 540 540 540 539 540 538 539 539 540 540 540 539 539 544 540 4 540 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 // SPDX-License-Identifier: GPL-2.0-only /* * sd.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * * Linux scsi disk driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * Modification history: * - Drew Eckhardt <drew@colorado.edu> original * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple * outstanding request, and other enhancements. * Support loadable low-level scsi drivers. * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using * eight major numbers. * - Richard Gooch <rgooch@atnf.csiro.au> support devfs. * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in * sd_init and cleanups. * - Alex Davis <letmein@erols.com> Fix problem where partition info * not being read in sd_open. Fix problem where removable media * could be ejected after sd_open. * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox * <willy@debian.org>, Kurt Garloff <garloff@suse.de>: * Support 32k/1M disks. * * Logging policy (needs CONFIG_SCSI_LOGGING defined): * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1 * - entering other commands: SCSI_LOG_HLQUEUE level 3 * Note: when the logging level is set by the user, it must be greater * than the level indicated above to trigger output. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/bio.h> #include <linux/hdreg.h> #include <linux/errno.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/blk-pm.h> #include <linux/delay.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/string_helpers.h> #include <linux/slab.h> #include <linux/sed-opal.h> #include <linux/pm_runtime.h> #include <linux/pr.h> #include <linux/t10-pi.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsicam.h> #include <scsi/scsi_common.h> #include "sd.h" #include "scsi_priv.h" #include "scsi_logging.h" MODULE_AUTHOR("Eric Youngdale"); MODULE_DESCRIPTION("SCSI disk (sd) driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR); MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK); MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 static void sd_config_discard(struct scsi_disk *, unsigned int); static void sd_config_write_same(struct scsi_disk *); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); static mempool_t *sd_page_pool; static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", "write back, no read (daft)" }; static void sd_set_flush_flag(struct scsi_disk *sdkp) { bool wc = false, fua = false; if (sdkp->WCE) { wc = true; if (sdkp->DPOFUA) fua = true; } blk_queue_write_cache(sdkp->disk->queue, wc, fua); } static ssize_t cache_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ct, rcd, wce, sp; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; char buffer[64]; char *buffer_data; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; static const char temp[] = "temporary "; int len, ret; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) /* no cache control on RBC devices; theoretically they * can do it, but there's probably so many exceptions * it's not worth the risk */ return -EINVAL; if (strncmp(buf, temp, sizeof(temp) - 1) == 0) { buf += sizeof(temp) - 1; sdkp->cache_override = 1; } else { sdkp->cache_override = 0; } ct = sysfs_match_string(sd_cache_types, buf); if (ct < 0) return -EINVAL; rcd = ct & 0x01 ? 1 : 0; wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0; if (sdkp->cache_override) { sdkp->WCE = wce; sdkp->RCD = rcd; sd_set_flush_flag(sdkp); return count; } if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), SD_TIMEOUT, sdkp->max_retries, &data, NULL)) return -EINVAL; len = min_t(size_t, sizeof(buffer), data.length - data.header_length - data.block_descriptor_length); buffer_data = buffer + data.header_length + data.block_descriptor_length; buffer_data[2] &= ~0x05; buffer_data[2] |= wce << 2 | rcd; sp = buffer_data[0] & 0x80 ? 1 : 0; buffer_data[0] &= ~0x80; /* * Ensure WP, DPOFUA, and RESERVED fields are cleared in * received mode parameter buffer before doing MODE SELECT. */ data.device_specific = 0; ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return -EINVAL; } sd_revalidate_disk(sdkp->disk); return count; } static ssize_t manage_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop && sdp->manage_runtime_start_stop && sdp->manage_shutdown); } static DEVICE_ATTR_RO(manage_start_stop); static ssize_t manage_system_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop); } static ssize_t manage_system_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_system_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_system_start_stop); static ssize_t manage_runtime_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_runtime_start_stop); } static ssize_t manage_runtime_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_runtime_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_runtime_start_stop); static ssize_t manage_shutdown_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_shutdown); } static ssize_t manage_shutdown_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_shutdown = v; return count; } static DEVICE_ATTR_RW(manage_shutdown); static ssize_t allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->device->allow_restart); } static ssize_t allow_restart_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { bool v; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; if (kstrtobool(buf, &v)) return -EINVAL; sdp->allow_restart = v; return count; } static DEVICE_ATTR_RW(allow_restart); static ssize_t cache_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); int ct = sdkp->RCD + 2*sdkp->WCE; return sprintf(buf, "%s\n", sd_cache_types[ct]); } static DEVICE_ATTR_RW(cache_type); static ssize_t FUA_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->DPOFUA); } static DEVICE_ATTR_RO(FUA); static ssize_t protection_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->protection_type); } static ssize_t protection_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); unsigned int val; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &val); if (err) return err; if (val <= T10_PI_TYPE3_PROTECTION) sdkp->protection_type = val; return count; } static DEVICE_ATTR_RW(protection_type); static ssize_t protection_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned int dif, dix; dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); dix = scsi_host_dix_capable(sdp->host, sdkp->protection_type); if (!dix && scsi_host_dix_capable(sdp->host, T10_PI_TYPE0_PROTECTION)) { dif = 0; dix = 1; } if (!dif && !dix) return sprintf(buf, "none\n"); return sprintf(buf, "%s%u\n", dix ? "dix" : "dif", dif); } static DEVICE_ATTR_RO(protection_mode); static ssize_t app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->ATO); } static DEVICE_ATTR_RO(app_tag_own); static ssize_t thin_provisioning_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->lbpme); } static DEVICE_ATTR_RO(thin_provisioning); /* sysfs_match_string() requires dense arrays */ static const char *lbp_mode[] = { [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", [SD_LBP_WS10] = "writesame_10", [SD_LBP_ZERO] = "writesame_zero", [SD_LBP_DISABLE] = "disabled", }; static ssize_t provisioning_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", lbp_mode[sdkp->provisioning_mode]); } static ssize_t provisioning_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sd_is_zoned(sdkp)) { sd_config_discard(sdkp, SD_LBP_DISABLE); return count; } if (sdp->type != TYPE_DISK) return -EINVAL; mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; sd_config_discard(sdkp, mode); return count; } static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { [SD_ZERO_WRITE] = "write", [SD_ZERO_WS] = "writesame", [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", }; static ssize_t zeroing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", zeroing_mode[sdkp->zeroing_mode]); } static ssize_t zeroing_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; mode = sysfs_match_string(zeroing_mode, buf); if (mode < 0) return -EINVAL; sdkp->zeroing_mode = mode; return count; } static DEVICE_ATTR_RW(zeroing_mode); static ssize_t max_medium_access_timeouts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_medium_access_timeouts); } static ssize_t max_medium_access_timeouts_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts); return err ? err : count; } static DEVICE_ATTR_RW(max_medium_access_timeouts); static ssize_t max_write_same_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_ws_blocks); } static ssize_t max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned long max; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; err = kstrtoul(buf, 10, &max); if (err) return err; if (max == 0) sdp->no_write_same = 1; else if (max <= SD_MAX_WS16_BLOCKS) { sdp->no_write_same = 0; sdkp->max_ws_blocks = max; } sd_config_write_same(sdkp); return count; } static DEVICE_ATTR_RW(max_write_same_blocks); static ssize_t zoned_cap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); if (sdkp->device->type == TYPE_ZBC) return sprintf(buf, "host-managed\n"); if (sdkp->zoned == 1) return sprintf(buf, "host-aware\n"); if (sdkp->zoned == 2) return sprintf(buf, "drive-managed\n"); return sprintf(buf, "none\n"); } static DEVICE_ATTR_RO(zoned_cap); static ssize_t max_retries_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdev = sdkp->device; int retries, err; err = kstrtoint(buf, 10, &retries); if (err) return err; if (retries == SCSI_CMD_RETRIES_NO_LIMIT || retries <= SD_MAX_RETRIES) { sdkp->max_retries = retries; return count; } sdev_printk(KERN_ERR, sdev, "max_retries must be between -1 and %d\n", SD_MAX_RETRIES); return -EINVAL; } static ssize_t max_retries_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%d\n", sdkp->max_retries); } static DEVICE_ATTR_RW(max_retries); static struct attribute *sd_disk_attrs[] = { &dev_attr_cache_type.attr, &dev_attr_FUA.attr, &dev_attr_allow_restart.attr, &dev_attr_manage_start_stop.attr, &dev_attr_manage_system_start_stop.attr, &dev_attr_manage_runtime_start_stop.attr, &dev_attr_manage_shutdown.attr, &dev_attr_protection_type.attr, &dev_attr_protection_mode.attr, &dev_attr_app_tag_own.attr, &dev_attr_thin_provisioning.attr, &dev_attr_provisioning_mode.attr, &dev_attr_zeroing_mode.attr, &dev_attr_max_write_same_blocks.attr, &dev_attr_max_medium_access_timeouts.attr, &dev_attr_zoned_cap.attr, &dev_attr_max_retries.attr, NULL, }; ATTRIBUTE_GROUPS(sd_disk); static struct class sd_disk_class = { .name = "scsi_disk", .dev_release = scsi_disk_release, .dev_groups = sd_disk_groups, }; /* * Don't request a new module, as that could deadlock in multipath * environment. */ static void sd_default_probe(dev_t devt) { } /* * Device no to disk mapping: * * major disc2 disc p1 * |............|.............|....|....| <- dev_t * 31 20 19 8 7 4 3 0 * * Inside a major, we have 16k disks, however mapped non- * contiguously. The first 16 disks are for major0, the next * ones with major1, ... Disk 256 is for major0 again, disk 272 * for major1, ... * As we stay compatible with our numbering scheme, we can reuse * the well-know SCSI majors 8, 65--71, 136--143. */ static int sd_major(int major_idx) { switch (major_idx) { case 0: return SCSI_DISK0_MAJOR; case 1 ... 7: return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; default: BUG(); return 0; /* shut up gcc */ } } #ifdef CONFIG_BLK_SED_OPAL static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct scsi_disk *sdkp = data; struct scsi_device *sdev = sdkp->device; u8 cdb[12] = { 0, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; int ret; cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN; cdb[1] = secp; put_unaligned_be16(spsp, &cdb[2]); put_unaligned_be32(len, &cdb[6]); ret = scsi_execute_cmd(sdev, cdb, send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, buffer, len, SD_TIMEOUT, sdkp->max_retries, &exec_args); return ret <= 0 ? ret : -EIO; } #endif /* CONFIG_BLK_SED_OPAL */ /* * Look up the DIX operation based on whether the command is read or * write and whether dix and dif are enabled. */ static unsigned int sd_prot_op(bool write, bool dix, bool dif) { /* Lookup table: bit 2 (write), bit 1 (dix), bit 0 (dif) */ static const unsigned int ops[] = { /* wrt dix dif */ SCSI_PROT_NORMAL, /* 0 0 0 */ SCSI_PROT_READ_STRIP, /* 0 0 1 */ SCSI_PROT_READ_INSERT, /* 0 1 0 */ SCSI_PROT_READ_PASS, /* 0 1 1 */ SCSI_PROT_NORMAL, /* 1 0 0 */ SCSI_PROT_WRITE_INSERT, /* 1 0 1 */ SCSI_PROT_WRITE_STRIP, /* 1 1 0 */ SCSI_PROT_WRITE_PASS, /* 1 1 1 */ }; return ops[write << 2 | dix << 1 | dif]; } /* * Returns a mask of the protection flags that are valid for a given DIX * operation. */ static unsigned int sd_prot_flag_mask(unsigned int prot_op) { static const unsigned int flag_mask[] = { [SCSI_PROT_NORMAL] = 0, [SCSI_PROT_READ_STRIP] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_READ_INSERT] = SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_READ_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_WRITE_STRIP] = SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, }; return flag_mask[prot_op]; } static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, unsigned int dix, unsigned int dif) { struct request *rq = scsi_cmd_to_rq(scmd); struct bio *bio = rq->bio; unsigned int prot_op = sd_prot_op(rq_data_dir(rq), dix, dif); unsigned int protect = 0; if (dix) { /* DIX Type 0, 1, 2, 3 */ if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } if (dif) { /* DIX/DIF Type 1, 2, 3 */ scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) protect = 3 << 5; /* Disable target PI checking */ else protect = 1 << 5; /* Enable target PI checking */ } scsi_set_prot_op(scmd, prot_op); scsi_set_prot_type(scmd, dif); scmd->prot_flags &= sd_prot_flag_mask(prot_op); return protect; } static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) { struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; q->limits.discard_alignment = sdkp->unmap_alignment * logical_block_size; q->limits.discard_granularity = max(sdkp->physical_block_size, sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { case SD_LBP_FULL: case SD_LBP_DISABLE: blk_queue_max_discard_sectors(q, 0); return; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS16: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); break; } blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) { struct page *page; page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!page) return NULL; clear_highpage(page); bvec_set_page(&rq->special_vec, page, data_len, 0); rq->rq_flags |= RQF_SPECIAL_PAYLOAD; return bvec_virt(&rq->special_vec); } static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; char *buf; buf = sd_set_special_bvec(rq, data_len); if (!buf) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); put_unaligned_be32(nr_blocks, &buf[16]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = SD_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: return sd_setup_write_same16_cmnd(cmd, true); case SD_ZERO_WS10_UNMAP: return sd_setup_write_same10_cmnd(cmd, true); } } if (sdp->no_write_same) { rq->rq_flags |= RQF_QUIET; return BLK_STS_TARGET; } if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); return sd_setup_write_same10_cmnd(cmd, false); } static void sd_config_write_same(struct scsi_disk *sdkp) { struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { sdkp->max_ws_blocks = 0; goto out; } /* Some devices can not handle block counts above 0xffff despite * supporting WRITE SAME(16). Consequently we default to 64k * blocks per I/O unless the device explicitly advertises a * bigger limit. */ if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); else { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; } if (sdkp->lbprz && sdkp->lbpws) sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP; else if (sdkp->lbprz && sdkp->lbpws10) sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP; else if (sdkp->max_ws_blocks) sdkp->zeroing_mode = SD_ZERO_WS; else sdkp->zeroing_mode = SD_ZERO_WRITE; if (sdkp->max_ws_blocks && sdkp->physical_block_size > logical_block_size) { /* * Reporting a maximum number of blocks that is not aligned * on the device physical size would cause a large write same * request to be split into physically unaligned chunks by * __blkdev_issue_write_zeroes() even if the caller of this * functions took care to align the large request. So make sure * the maximum reported is aligned to the device physical block * size. This is only an optional optimization for regular * disks, but this is mandatory to avoid failure of large write * same requests directed at sequential write required zones of * host-managed ZBC disks. */ sdkp->max_ws_blocks = round_down(sdkp->max_ws_blocks, bytes_to_logical(sdkp->device, sdkp->physical_block_size)); } out: blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks * (logical_block_size >> 9)); } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); if (cmd->device->use_16_for_sync) { cmd->cmnd[0] = SYNCHRONIZE_CACHE_16; cmd->cmd_len = 16; } else { cmd->cmnd[0] = SYNCHRONIZE_CACHE; cmd->cmd_len = 10; } cmd->transfersize = 0; cmd->allowed = sdkp->max_retries; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; return BLK_STS_OK; } static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = SD_EXT_CDB_SIZE; cmd->cmnd[0] = VARIABLE_LENGTH_CMD; cmd->cmnd[7] = 0x18; /* Additional CDB len */ cmd->cmnd[9] = write ? WRITE_32 : READ_32; cmd->cmnd[10] = flags; cmd->cmnd[11] = dld & 0x07; put_unaligned_be64(lba, &cmd->cmnd[12]); put_unaligned_be32(lba, &cmd->cmnd[20]); /* Expected Indirect LBA */ put_unaligned_be32(nr_blocks, &cmd->cmnd[28]); return BLK_STS_OK; } static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = 16; cmd->cmnd[0] = write ? WRITE_16 : READ_16; cmd->cmnd[1] = flags | ((dld >> 2) & 0x01); cmd->cmnd[14] = (dld & 0x03) << 6; cmd->cmnd[15] = 0; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); return BLK_STS_OK; } static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { cmd->cmd_len = 10; cmd->cmnd[0] = write ? WRITE_10 : READ_10; cmd->cmnd[1] = flags; cmd->cmnd[6] = 0; cmd->cmnd[9] = 0; put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); return BLK_STS_OK; } static blk_status_t sd_setup_rw6_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { /* Avoid that 0 blocks gets translated into 256 blocks. */ if (WARN_ON_ONCE(nr_blocks == 0)) return BLK_STS_IOERR; if (unlikely(flags & 0x8)) { /* * This happens only if this drive failed 10byte rw * command with ILLEGAL_REQUEST during operation and * thus turned off use_10_for_rw. */ scmd_printk(KERN_ERR, cmd, "FUA write on READ/WRITE(6) drive\n"); return BLK_STS_IOERR; } cmd->cmd_len = 6; cmd->cmnd[0] = write ? WRITE_6 : READ_6; cmd->cmnd[1] = (lba >> 16) & 0x1f; cmd->cmnd[2] = (lba >> 8) & 0xff; cmd->cmnd[3] = lba & 0xff; cmd->cmnd[4] = nr_blocks; cmd->cmnd[5] = 0; return BLK_STS_OK; } /* * Check if a command has a duration limit set. If it does, and the target * device supports CDL and the feature is enabled, return the limit * descriptor index to use. Return 0 (no limit) otherwise. */ static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd) { struct scsi_device *sdp = sdkp->device; int hint; if (!sdp->cdl_supported || !sdp->cdl_enable) return 0; /* * Use "no limit" if the request ioprio does not specify a duration * limit hint. */ hint = IOPRIO_PRIO_HINT(req_get_ioprio(scsi_cmd_to_rq(scmd))); if (hint < IOPRIO_HINT_DEV_DURATION_LIMIT_1 || hint > IOPRIO_HINT_DEV_DURATION_LIMIT_7) return 0; return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1; } static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int mask = logical_to_sectors(sdp, 1) - 1; bool write = rq_data_dir(rq) == WRITE; unsigned char protect, fua; unsigned int dld; blk_status_t ret; unsigned int dif; bool dix; ret = scsi_alloc_sgtables(cmd); if (ret != BLK_STS_OK) return ret; ret = BLK_STS_IOERR; if (!scsi_device_online(sdp) || sdp->changed) { scmd_printk(KERN_ERR, cmd, "device offline or changed\n"); goto fail; } if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) { scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n"); goto fail; } /* * Some SD card readers can't handle accesses which touch the * last one or two logical blocks. Split accesses as needed. */ threshold = sdkp->capacity - SD_LAST_BUGGY_SECTORS; if (unlikely(sdp->last_sector_bug && lba + nr_blocks > threshold)) { if (lba < threshold) { /* Access up to the threshold but not beyond */ nr_blocks = threshold - lba; } else { /* Access only a single logical block */ nr_blocks = 1; } } if (req_op(rq) == REQ_OP_ZONE_APPEND) { ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks); if (ret) goto fail; } fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); dld = sd_cdl_dld(sdkp, cmd); if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else protect = 0; if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) { ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) || sdp->use_10_for_rw || protect) { ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks, protect | fua); } else { ret = sd_setup_rw6_cmnd(cmd, write, lba, nr_blocks, protect | fua); } if (unlikely(ret != BLK_STS_OK)) goto fail; /* * We shouldn't disconnect in the middle of a sector, so with a dumb * host adapter, it's safe to assume that we can at least transfer * this many bytes between each connect / disconnect. */ cmd->transfersize = sdp->sector_size; cmd->underflow = nr_blocks << 9; cmd->allowed = sdkp->max_retries; cmd->sdb.length = nr_blocks * sdp->sector_size; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, cmd, "%s: block=%llu, count=%d\n", __func__, (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, cmd, "%s %d/%u 512 byte blocks.\n", write ? "writing" : "reading", nr_blocks, blk_rq_sectors(rq))); /* * This indicates that the command is ready from our end to be queued. */ return BLK_STS_OK; fail: scsi_free_sgtables(cmd); return ret; } static blk_status_t sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); switch (req_op(rq)) { case REQ_OP_DISCARD: switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: return sd_setup_write_same16_cmnd(cmd, true); case SD_LBP_WS10: return sd_setup_write_same10_cmnd(cmd, true); case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: return BLK_STS_TARGET; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); case REQ_OP_FLUSH: return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: case REQ_OP_ZONE_APPEND: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, false); case REQ_OP_ZONE_RESET_ALL: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, true); case REQ_OP_ZONE_OPEN: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false); case REQ_OP_ZONE_CLOSE: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false); case REQ_OP_ZONE_FINISH: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false); default: WARN_ON_ONCE(1); return BLK_STS_NOTSUPP; } } static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = scsi_cmd_to_rq(SCpnt); if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) mempool_free(rq->special_vec.bv_page, sd_page_pool); } static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) { if (sdkp->device->removable || sdkp->write_prot) { if (disk_check_media_change(disk)) return true; } /* * Force a full rescan after ioctl(BLKRRPART). While the disk state has * nothing to do with partitions, BLKRRPART is used to force a full * revalidate after things like a format for historical reasons. */ return test_bit(GD_NEED_PART_SCAN, &disk->state); } /** * sd_open - open a scsi disk device * @disk: disk to open * @mode: open mode * * Returns 0 if successful. Returns a negated errno value in case * of error. * * Note: This can be called from a user context (e.g. fsck(1) ) * or from within the kernel (e.g. as a result of a mount(1) ). * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * * Locking: called with disk->open_mutex held. **/ static int sd_open(struct gendisk *disk, blk_mode_t mode) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; int retval; if (scsi_device_get(sdev)) return -ENXIO; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); /* * If the device is in error recovery, wait until it is done. * If the device is offline, then disallow any access to it. */ retval = -ENXIO; if (!scsi_block_when_processing_errors(sdev)) goto error_out; if (sd_need_revalidate(disk, sdkp)) sd_revalidate_disk(disk); /* * If the drive is empty, just let the open fail. */ retval = -ENOMEDIUM; if (sdev->removable && !sdkp->media_present && !(mode & BLK_OPEN_NDELAY)) goto error_out; /* * If the device has the write protect tab set, have the open fail * if the user expects to be able to write to the thing. */ retval = -EROFS; if (sdkp->write_prot && (mode & BLK_OPEN_WRITE)) goto error_out; /* * It is possible that the disk changing stuff resulted in * the device being taken offline. If this is the case, * report this to the user, and don't pretend that the * open actually succeeded. */ retval = -ENXIO; if (!scsi_device_online(sdev)) goto error_out; if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } return 0; error_out: scsi_device_put(sdev); return retval; } /** * sd_release - invoked when the (last) close(2) is called on this * scsi disk. * @disk: disk to release * * Returns 0. * * Note: may block (uninterruptible) if error recovery is underway * on this disk. * * Locking: called with disk->open_mutex held. **/ static void sd_release(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); if (atomic_dec_return(&sdkp->openers) == 0 && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); } scsi_device_put(sdev); } static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); int diskinfo[4]; /* default to most commonly used values */ diskinfo[0] = 0x40; /* 1 << 6 */ diskinfo[1] = 0x20; /* 1 << 5 */ diskinfo[2] = capacity >> 11; /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) host->hostt->bios_param(sdp, bdev, capacity, diskinfo); else scsicam_bios_param(bdev, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; geo->cylinders = diskinfo[2]; return 0; } /** * sd_ioctl - process an ioctl * @bdev: target block device * @mode: open mode * @cmd: ioctl command number * @arg: this is third argument given to ioctl(2) system call. * Often contains a pointer. * * Returns 0 if successful (some ioctls return positive numbers on * success as well). Returns a negated errno value in case of error. * * Note: most ioctls are forward onto the block subsystem or further * down in the scsi subsystem. **/ static int sd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; void __user *p = (void __user *)arg; int error; SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, " "cmd=0x%x\n", disk->disk_name, cmd)); if (bdev_is_partition(bdev) && !capable(CAP_SYS_RAWIO)) return -ENOIOCTLCMD; /* * If we are in the middle of error recovery, don't let anyone * else try and use this device. Also, if error recovery fails, it * may try and take the device offline, in which case all further * access to the device is prohibited. */ error = scsi_ioctl_block_when_processing_errors(sdp, cmd, (mode & BLK_OPEN_NDELAY)); if (error) return error; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); return scsi_ioctl(sdp, mode & BLK_OPEN_WRITE, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) { if (sdkp->media_present) sdkp->device->changed = 1; if (sdkp->device->removable) { sdkp->media_present = 0; sdkp->capacity = 0; } } static int media_not_present(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { if (!scsi_sense_valid(sshdr)) return 0; /* not invoked for commands that could return deferred errors */ switch (sshdr->sense_key) { case UNIT_ATTENTION: case NOT_READY: /* medium not present */ if (sshdr->asc == 0x3A) { set_media_not_present(sdkp); return 1; } } return 0; } /** * sd_check_events - check media events * @disk: kernel device descriptor * @clearing: disk events currently being cleared * * Returns mask of DISK_EVENT_*. * * Note: this function is invoked from the block subsystem. **/ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) { struct scsi_disk *sdkp = disk->private_data; struct scsi_device *sdp; int retval; bool disk_changed; if (!sdkp) return 0; sdp = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n")); /* * If the device is offline, don't send any commands - just pretend as * if the command failed. If the device ever comes back online, we * can deal with it then. It is only because of unrecoverable errors * that we would ever take a device offline in the first place. */ if (!scsi_device_online(sdp)) { set_media_not_present(sdkp); goto out; } /* * Using TEST_UNIT_READY enables differentiation between drive with * no cartridge loaded - NOT READY, drive with changed cartridge - * UNIT ATTENTION, or with same cartridge - GOOD STATUS. * * Drives that auto spin down. eg iomega jaz 1G, will be started * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever * sd_revalidate() is called. */ if (scsi_block_when_processing_errors(sdp)) { struct scsi_sense_hdr sshdr = { 0, }; retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, sdkp->max_retries, &sshdr); /* failed to execute TUR, assume media not present */ if (retval < 0 || host_byte(retval)) { set_media_not_present(sdkp); goto out; } if (media_not_present(sdkp, &sshdr)) goto out; } /* * For removable scsi disk we have to recognise the presence * of a disk in the drive. */ if (!sdkp->media_present) sdp->changed = 1; sdkp->media_present = 1; out: /* * sdp->changed is set under the following conditions: * * Medium present state has changed in either direction. * Device has indicated UNIT_ATTENTION. */ disk_changed = sdp->changed; sdp->changed = 0; return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } static int sd_sync_cache(struct scsi_disk *sdkp) { int retries, res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, .sshdr = &sshdr, }; if (!scsi_device_online(sdp)) return -ENODEV; for (retries = 3; retries > 0; --retries) { unsigned char cmd[16] = { 0 }; if (sdp->use_16_for_sync) cmd[0] = SYNCHRONIZE_CACHE_16; else cmd[0] = SYNCHRONIZE_CACHE; /* * Leave the rest of the command zero to indicate * flush everything. */ res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, timeout, sdkp->max_retries, &exec_args); if (res == 0) break; } if (res) { sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (res < 0) return res; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ if (sshdr.asc == 0x3a || /* medium not present */ sshdr.asc == 0x20 || /* invalid command */ (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; /* * This drive doesn't support sync and there's not much * we can do because this is called during shutdown * or suspend so just return success so those operations * can proceed. */ if (sshdr.sense_key == ILLEGAL_REQUEST) return 0; } switch (host_byte(res)) { /* ignore errors due to racing a disconnection */ case DID_BAD_TARGET: case DID_NO_CONNECT: return 0; /* signal the upper layer it might try again */ case DID_BUS_BUSY: case DID_IMM_RETRY: case DID_REQUEUE: case DID_SOFT_ERROR: return -EBUSY; default: return -EIO; } } return 0; } static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_revalidate_disk(sdkp->disk); } static int sd_get_unique_id(struct gendisk *disk, u8 id[16], enum blk_unique_id type) { struct scsi_device *sdev = scsi_disk(disk)->device; const struct scsi_vpd *vpd; const unsigned char *d; int ret = -ENXIO, len; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg83); if (!vpd) goto out_unlock; ret = -EINVAL; for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) { /* we only care about designators with LU association */ if (((d[1] >> 4) & 0x3) != 0x00) continue; if ((d[1] & 0xf) != type) continue; /* * Only exit early if a 16-byte descriptor was found. Otherwise * keep looking as one with more entropy might still show up. */ len = d[3]; if (len != 8 && len != 12 && len != 16) continue; ret = len; memcpy(id, d + 4, len); if (len == 16) break; } out_unlock: rcu_read_unlock(); return ret; } static int sd_scsi_to_pr_err(struct scsi_sense_hdr *sshdr, int result) { switch (host_byte(result)) { case DID_TRANSPORT_MARGINAL: case DID_TRANSPORT_DISRUPTED: case DID_BUS_BUSY: return PR_STS_RETRY_PATH_FAILURE; case DID_NO_CONNECT: return PR_STS_PATH_FAILED; case DID_TRANSPORT_FAILFAST: return PR_STS_PATH_FAST_FAILED; } switch (status_byte(result)) { case SAM_STAT_RESERVATION_CONFLICT: return PR_STS_RESERVATION_CONFLICT; case SAM_STAT_CHECK_CONDITION: if (!scsi_sense_valid(sshdr)) return PR_STS_IOERR; if (sshdr->sense_key == ILLEGAL_REQUEST && (sshdr->asc == 0x26 || sshdr->asc == 0x24)) return -EINVAL; fallthrough; default: return PR_STS_IOERR; } } static int sd_pr_in_command(struct block_device *bdev, u8 sa, unsigned char *data, int data_len) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; u8 cmd[10] = { PERSISTENT_RESERVE_IN, sa }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int result; put_unaligned_be16(data_len, &cmd[7]); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, data, data_len, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_read_keys(struct block_device *bdev, struct pr_keys *keys_info) { int result, i, data_offset, num_copy_keys; u32 num_keys = keys_info->num_keys; int data_len = num_keys * 8 + 8; u8 *data; data = kzalloc(data_len, GFP_KERNEL); if (!data) return -ENOMEM; result = sd_pr_in_command(bdev, READ_KEYS, data, data_len); if (result) goto free_data; keys_info->generation = get_unaligned_be32(&data[0]); keys_info->num_keys = get_unaligned_be32(&data[4]) / 8; data_offset = 8; num_copy_keys = min(num_keys, keys_info->num_keys); for (i = 0; i < num_copy_keys; i++) { keys_info->keys[i] = get_unaligned_be64(&data[data_offset]); data_offset += 8; } free_data: kfree(data); return result; } static int sd_pr_read_reservation(struct block_device *bdev, struct pr_held_reservation *rsv) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; u8 data[24] = { }; int result, len; result = sd_pr_in_command(bdev, READ_RESERVATION, data, sizeof(data)); if (result) return result; len = get_unaligned_be32(&data[4]); if (!len) return 0; /* Make sure we have at least the key and type */ if (len < 14) { sdev_printk(KERN_INFO, sdev, "READ RESERVATION failed due to short return buffer of %d bytes\n", len); return -EINVAL; } rsv->generation = get_unaligned_be32(&data[0]); rsv->key = get_unaligned_be64(&data[8]); rsv->type = scsi_pr_type_to_block(data[21] & 0x0f); return 0; } static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key, u64 sa_key, enum scsi_pr_type type, u8 flags) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int result; u8 cmd[16] = { 0, }; u8 data[24] = { 0, }; cmd[0] = PERSISTENT_RESERVE_OUT; cmd[1] = sa; cmd[2] = type; put_unaligned_be32(sizeof(data), &cmd[5]); put_unaligned_be64(key, &data[0]); put_unaligned_be64(sa_key, &data[8]); data[20] = flags; result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, &data, sizeof(data), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, u32 flags) { if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return sd_pr_out_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00, old_key, new_key, 0, (1 << 0) /* APTPL */); } static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { if (flags) return -EOPNOTSUPP; return sd_pr_out_command(bdev, 0x01, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { return sd_pr_out_command(bdev, 0x02, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { return sd_pr_out_command(bdev, abort ? 0x05 : 0x04, old_key, new_key, block_pr_type_to_scsi(type), 0); } static int sd_pr_clear(struct block_device *bdev, u64 key) { return sd_pr_out_command(bdev, 0x03, key, 0, 0, 0); } static const struct pr_ops sd_pr_ops = { .pr_register = sd_pr_register, .pr_reserve = sd_pr_reserve, .pr_release = sd_pr_release, .pr_preempt = sd_pr_preempt, .pr_clear = sd_pr_clear, .pr_read_keys = sd_pr_read_keys, .pr_read_reservation = sd_pr_read_reservation, }; static void scsi_disk_free_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); put_device(&sdkp->disk_dev); } static const struct block_device_operations sd_fops = { .owner = THIS_MODULE, .open = sd_open, .release = sd_release, .ioctl = sd_ioctl, .getgeo = sd_getgeo, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = sd_check_events, .unlock_native_capacity = sd_unlock_native_capacity, .report_zones = sd_zbc_report_zones, .get_unique_id = sd_get_unique_id, .free_disk = scsi_disk_free_disk, .pr_ops = &sd_pr_ops, }; /** * sd_eh_reset - reset error handling callback * @scmd: sd-issued command that has failed * * This function is called by the SCSI midlayer before starting * SCSI EH. When counting medium access failures we have to be * careful to register it only only once per device and SCSI EH run; * there might be several timed out commands which will cause the * 'max_medium_access_timeouts' counter to trigger after the first * SCSI EH run already and set the device to offline. * So this function resets the internal counter before starting SCSI EH. **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; } /** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer * * This function is called by the SCSI midlayer upon completion of an * error test command (currently TEST UNIT READY). The result of sending * the eh command is passed in eh_disp. We're looking for devices that * fail medium access commands but are OK with non access commands like * test unit ready (so wrongly see the device as having a successful * recovery) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || !scsi_medium_access_command(scmd) || host_byte(scmd->result) != DID_TIME_OUT || eh_disp != SUCCESS) return eh_disp; /* * The device has timed out executing a medium access command. * However, the TEST UNIT READY command sent during error * handling completed successfully. Either the device is in the * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ if (!sdkp->ignore_medium_access_errors) { sdkp->medium_access_timed_out++; sdkp->ignore_medium_access_errors = true; } /* * If the device keeps failing read/write commands but TEST UNIT * READY always completes successfully we assume that medium * access is no longer possible and take the device offline. */ if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) { scmd_printk(KERN_ERR, scmd, "Medium access timeout failure. Offlining disk!\n"); mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); return SUCCESS; } return eh_disp; } static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; unsigned int transferred, good_bytes; u64 start_lba, end_lba, bad_lba; /* * Some commands have a payload smaller than the device logical * block size (e.g. INQUIRY on a 4K disk). */ if (scsi_bufflen(scmd) <= sdev->sector_size) return 0; /* Check if we have a 'bad_lba' information */ if (!scsi_get_sense_info_fld(scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, &bad_lba)) return 0; /* * If the bad lba was reported incorrectly, we have no idea where * the error is. */ start_lba = sectors_to_logical(sdev, blk_rq_pos(req)); end_lba = start_lba + bytes_to_logical(sdev, scsi_bufflen(scmd)); if (bad_lba < start_lba || bad_lba >= end_lba) return 0; /* * resid is optional but mostly filled in. When it's unused, * its value is zero, so we assume the whole buffer transferred */ transferred = scsi_bufflen(scmd) - scsi_get_resid(scmd); /* This computation should always be done in terms of the * resolution of the device's medium. */ good_bytes = logical_to_bytes(sdev, bad_lba - start_lba); return min(good_bytes, transferred); } /** * sd_done - bottom half handler: called when the lower level * driver has completed (successfully or otherwise) a scsi command. * @SCpnt: mid-level's per command structure. * * Note: potentially run from within an ISR. Must not block. **/ static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); unsigned int sector_size = SCpnt->device->sector_size; unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; switch (req_op(req)) { case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: if (!result) { good_bytes = blk_rq_bytes(req); scsi_set_resid(SCpnt, 0); } else { good_bytes = 0; scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; default: /* * In case of bogus fw or device, we could end up having * an unaligned partial completion. Check this here and force * alignment. */ resid = scsi_get_resid(SCpnt); if (resid & (sector_size - 1)) { sd_printk(KERN_INFO, sdkp, "Unaligned partial completion (resid=%u, sector_sz=%u)\n", resid, sector_size); scsi_print_command(SCpnt); resid = min(scsi_bufflen(SCpnt), round_up(resid, sector_size)); scsi_set_resid(SCpnt, resid); } } if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) sense_deferred = scsi_sense_is_deferred(&sshdr); } sdkp->medium_access_timed_out = 0; if (!scsi_status_is_check_condition(result) && (!sense_valid || sense_deferred)) goto out; switch (sshdr.sense_key) { case HARDWARE_ERROR: case MEDIUM_ERROR: good_bytes = sd_completed_bytes(SCpnt); break; case RECOVERED_ERROR: good_bytes = scsi_bufflen(SCpnt); break; case NO_SENSE: /* This indicates a false check condition, so ignore it. An * unknown amount of data was transferred so treat it as an * error. */ SCpnt->result = 0; memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); break; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF: Target detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case ILLEGAL_REQUEST: switch (sshdr.asc) { case 0x10: /* DIX: Host detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case 0x20: /* INVALID COMMAND OPCODE */ case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: sd_config_discard(sdkp, SD_LBP_DISABLE); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_config_discard(sdkp, SD_LBP_DISABLE); } else { sdkp->device->no_write_same = 1; sd_config_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; } } break; default: break; } out: if (sd_is_zoned(sdkp)) good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); return good_bytes; } /* * spinup disk - called only in sd_revalidate_disk() */ static void sd_spinup_disk(struct scsi_disk *sdkp) { unsigned char cmd[10]; unsigned long spintime_expire = 0; int retries, spintime; unsigned int the_result; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int sense_valid = 0; spintime = 0; /* Spin up drives, as required. Only do this at boot time */ /* Spinup needs to be done for module loads too. */ do { retries = 0; do { bool media_was_present = sdkp->media_present; cmd[0] = TEST_UNIT_READY; memset((void *) &cmd[1], 0, 9); the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { /* * If the drive has indicated to us that it * doesn't have any media in it, don't bother * with any more polling. */ if (media_not_present(sdkp, &sshdr)) { if (media_was_present) sd_printk(KERN_NOTICE, sdkp, "Media removed, stopped polling\n"); return; } sense_valid = scsi_sense_valid(&sshdr); } retries++; } while (retries < 3 && (!scsi_status_is_good(the_result) || (scsi_status_is_check_condition(the_result) && sense_valid && sshdr.sense_key == UNIT_ATTENTION))); if (!scsi_status_is_check_condition(the_result)) { /* no sense, TUR either succeeded or failed * with a status error */ if(!spintime && !scsi_status_is_good(the_result)) { sd_print_result(sdkp, "Test Unit Ready failed", the_result); } break; } /* * The device does not want the automatic start to be issued. */ if (sdkp->device->no_start_on_add) break; if (sense_valid && sshdr.sense_key == NOT_READY) { if (sshdr.asc == 4 && sshdr.ascq == 3) break; /* manual intervention required */ if (sshdr.asc == 4 && sshdr.ascq == 0xb) break; /* standby */ if (sshdr.asc == 4 && sshdr.ascq == 0xc) break; /* unavailable */ if (sshdr.asc == 4 && sshdr.ascq == 0x1b) break; /* sanitize in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x24) break; /* depopulation in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x25) break; /* depopulation restoration in progress */ /* * Issue command to spin up drive when not ready */ if (!spintime) { sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); cmd[0] = START_STOP; cmd[1] = 1; /* Return immediately */ memset((void *) &cmd[2], 0, 8); cmd[4] = 1; /* Start spin cycle */ if (sdkp->device->start_stop_pwr_cond) cmd[4] |= 1 << 4; scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); spintime_expire = jiffies + 100 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); printk(KERN_CONT "."); /* * Wait for USB flash devices with slow firmware. * Yes, this sense key/ASC combination shouldn't * occur here. It's characteristic of these devices. */ } else if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x28) { if (!spintime) { spintime_expire = jiffies + 5 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); } else { /* we don't understand the sense code, so it's * probably pointless to loop */ if(!spintime) { sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); sd_print_sense_hdr(sdkp, &sshdr); } break; } } while (spintime && time_before_eq(jiffies, spintime_expire)); if (spintime) { if (scsi_status_is_good(the_result)) printk(KERN_CONT "ready\n"); else printk(KERN_CONT "not responding...\n"); } } /* * Determine whether disk supports Data Integrity Field. */ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; u8 type; if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) { sdkp->protection_type = 0; return 0; } type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ if (type > T10_PI_TYPE3_PROTECTION) { sd_printk(KERN_ERR, sdkp, "formatted with unsupported" \ " protection type %u. Disabling disk!\n", type); sdkp->protection_type = 0; return -ENODEV; } sdkp->protection_type = type; return 0; } static void sd_config_protection(struct scsi_disk *sdkp) { struct scsi_device *sdp = sdkp->device; sd_dif_config_host(sdkp); if (!sdkp->protection_type) return; if (!scsi_host_dif_capable(sdp->host, sdkp->protection_type)) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling DIF Type %u protection\n", sdkp->protection_type); sdkp->protection_type = 0; } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIF Type %u protection\n", sdkp->protection_type); } static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, struct scsi_sense_hdr *sshdr, int sense_valid, int the_result) { if (sense_valid) sd_print_sense_hdr(sdkp, sshdr); else sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n"); /* * Set dirty bit for removable devices if not ready - * sometimes drives will not report this properly. */ if (sdp->removable && sense_valid && sshdr->sense_key == NOT_READY) set_media_not_present(sdkp); /* * We used to set media_present to 0 here to indicate no media * in the drive, but some drives fail read capacity even with * media present, so we can't do that. */ sdkp->capacity = 0; /* unknown mapped to zero - as usual */ } #define RC16_LEN 32 #if RC16_LEN > SD_BUF_SIZE #error RC16_LEN must not be more than SD_BUF_SIZE #endif #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int sense_valid = 0; int the_result; int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; unsigned int alignment; unsigned long long lba; unsigned sector_size; if (sdp->no_read_capacity_16) return -EINVAL; do { memset(cmd, 0, 16); cmd[0] = SERVICE_ACTION_IN_16; cmd[1] = SAI_READ_CAPACITY_16; cmd[13] = RC16_LEN; memset(buffer, 0, RC16_LEN); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, RC16_LEN, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { if (media_not_present(sdkp, &sshdr)) return -ENODEV; sense_valid = scsi_sense_valid(&sshdr); if (sense_valid && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc