5 7 7 3 4 3 6 2 2 2 7 8 7 7 7 7 7 6 5 6 5 3 7 8 7 6 8 5 4 4 8 4 7 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 // SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. * * Authors: Pavel Emelyanov, <xemul@parallels.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/udp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) goto out; err = -ENOMEM; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_put(sk); out_nosk: return err; } static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct nlattr *bc; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct udp_hslot *hslot = &table->hash[slot]; struct sock *sk; num = 0; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } next: num++; } spin_unlock_bh(&hslot->lock); } done: cb->args[0] = slot; cb->args[1] = num; } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int __udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req, struct udp_table *tbl) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); } #endif else { rcu_read_unlock(); return -EINVAL; } if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); if (!sk) return -ENOENT; if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_put(sk); return -ENOENT; } err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, &udplite_table); } #endif static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udp_diag_destroy, #endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udplite_diag_destroy, #endif }; static int __init udp_diag_init(void) { int err; err = inet_diag_register(&udp_diag_handler); if (err) goto out; err = inet_diag_register(&udplite_diag_handler); if (err) goto out_lite; out: return err; out_lite: inet_diag_unregister(&udp_diag_handler); goto out; } static void __exit udp_diag_exit(void) { inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
1 1 1 1 1 1 1 79 80 1 2 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 // SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2010 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> */ #include <linux/module.h> #include <linux/fs.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name) { ssize_t size; void *value = NULL; struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); if (size < 0) return ERR_PTR(size); if (size == 0) return ERR_PTR(-ENODATA); value = kzalloc(size, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); size = v9fs_fid_xattr_get(fid, name, value, size); if (size < 0) acl = ERR_PTR(size); else if (size == 0) acl = ERR_PTR(-ENODATA); else acl = posix_acl_from_xattr(&init_user_ns, value, size); kfree(value); return acl; } static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name) { struct p9_fid *fid; struct posix_acl *acl = NULL; fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); acl = v9fs_fid_get_acl(fid, name); p9_fid_put(fid); return acl; } static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name) { int retval; struct posix_acl *acl = NULL; acl = v9fs_fid_get_acl(fid, name); if (!IS_ERR(acl)) return acl; retval = PTR_ERR(acl); if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP) return NULL; /* map everything else to -EIO */ return ERR_PTR(-EIO); } int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { int retval = 0; struct posix_acl *pacl, *dacl; struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); return 0; } /* get the default/access acl values and cache them */ dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT); pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS); if (!IS_ERR(dacl) && !IS_ERR(pacl)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); } else retval = -EIO; if (!IS_ERR(dacl)) posix_acl_release(dacl); if (!IS_ERR(pacl)) posix_acl_release(pacl); return retval; } static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) { struct posix_acl *acl; /* * 9p Always cache the acl value when * instantiating the inode (v9fs_inode_from_fid) */ acl = get_cached_acl(inode, type); BUG_ON(is_uncached_acl(acl)); return acl; } struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu) { struct v9fs_session_info *v9ses; if (rcu) return ERR_PTR(-ECHILD); v9ses = v9fs_inode2v9ses(inode); if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { /* * On access = client and acl = on mode get the acl * values from the server */ return NULL; } return v9fs_get_cached_acl(inode, type); } struct posix_acl *v9fs_iop_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { struct v9fs_session_info *v9ses; v9ses = v9fs_dentry2v9ses(dentry); /* We allow set/get/list of acl when access=client is not specified. */ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) return v9fs_acl_get(dentry, posix_acl_xattr_name(type)); return v9fs_get_cached_acl(d_inode(dentry), type); } int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int retval; size_t size = 0; void *value = NULL; const char *acl_name; struct v9fs_session_info *v9ses; struct inode *inode = d_inode(dentry); if (acl) { retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (retval) goto err_out; size = posix_acl_xattr_size(acl->a_count); value = kzalloc(size, GFP_NOFS); if (!value) { retval = -ENOMEM; goto err_out; } retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (retval < 0) goto err_out; } /* * set the attribute on the remote. Without even looking at the * xattr value. We leave it to the server to validate */ acl_name = posix_acl_xattr_name(type); v9ses = v9fs_dentry2v9ses(dentry); if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); goto err_out; } if (S_ISLNK(inode->i_mode)) { retval = -EOPNOTSUPP; goto err_out; } if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { retval = -EPERM; goto err_out; } switch (type) { case ACL_TYPE_ACCESS: if (acl) { struct iattr iattr = {}; struct posix_acl *acl_mode = acl; retval = posix_acl_update_mode(&nop_mnt_idmap, inode, &iattr.ia_mode, &acl_mode); if (retval) goto err_out; if (!acl_mode) { /* * ACL can be represented by the mode bits. * So don't update ACL below. */ kfree(value); value = NULL; size = 0; } iattr.ia_valid = ATTR_MODE; /* * FIXME should we update ctime ? * What is the following setxattr update the mode ? */ v9fs_vfs_setattr_dotl(&nop_mnt_idmap, dentry, &iattr); } break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) { retval = acl ? -EINVAL : 0; goto err_out; } break; } retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); if (!retval) set_cached_acl(inode, type, acl); err_out: kfree(value); return retval; } static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; char *name; size_t size; void *buffer; if (!acl) return 0; /* Set a setxattr request to server */ size = posix_acl_xattr_size(acl->a_count); buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return -ENOMEM; retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); if (retval < 0) goto err_free_out; switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); } retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0); err_free_out: kfree(buffer); return retval; } int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) { int retval = 0; struct posix_acl *acl; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (retval) return retval; set_cached_acl(inode, ACL_TYPE_ACCESS, acl); retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); } return retval; } int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, struct posix_acl *dacl, struct posix_acl *acl) { set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); set_cached_acl(inode, ACL_TYPE_ACCESS, acl); v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl); v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl); return 0; } void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl) { posix_acl_release(dacl); posix_acl_release(acl); } int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { int retval = 0; umode_t mode = *modep; struct posix_acl *acl = NULL; if (!S_ISLNK(mode)) { acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) mode &= ~current_umask(); } if (acl) { if (S_ISDIR(mode)) *dpacl = posix_acl_dup(acl); retval = __posix_acl_create(&acl, GFP_NOFS, &mode); if (retval < 0) return retval; if (retval > 0) *pacl = acl; else posix_acl_release(acl); } *modep = mode; return 0; }
3 2 3 1 1 1 2 2 2 2 8 1 1 1 1 1 1 1 3 3 2 1 1 1 1 1 1 1 1 2 3 1 2 1 1 2 2 2 1 1 1 1 1 17 3 3 3 3 17 4 1 1 1 8 4 4 8 3 8 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Author: Michael S. Tsirkin <mst@redhat.com> * * virtio-net server in host kernel. */ #include <linux/compat.h> #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/if_tun.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/xdp.h> #include "vhost.h" static int experimental_zcopytx = 0; module_param(experimental_zcopytx, int, 0444); MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" " 1 -Enable; 0 - Disable"); /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with small * pkts. */ #define VHOST_NET_PKT_WEIGHT 256 /* MAX number of TX used buffers for outstanding zerocopy */ #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* * For transmit, used buffer len is unused; we override it to track buffer * status internally; used for zerocopy tx only. */ /* Lower device DMA failed */ #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) /* Lower device DMA done */ #define VHOST_DMA_DONE_LEN ((__force __virtio32)2) /* Lower device DMA in progress */ #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) /* Buffer unused */ #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_RESET) }; enum { VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, }; struct vhost_net_ubuf_ref { /* refcount follows semantics similar to kref: * 0: object is released * 1: no outstanding ubufs * >1: outstanding ubufs */ atomic_t refcount; wait_queue_head_t wait; struct vhost_virtqueue *vq; }; #define VHOST_NET_BATCH 64 struct vhost_net_buf { void **queue; int tail; int head; }; struct vhost_net_virtqueue { struct vhost_virtqueue vq; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ /* last used idx for outstanding DMA zerocopy buffers */ int upend_idx; /* For TX, first used idx for DMA done zerocopy buffers * For RX, number of batched heads */ int done_idx; /* Number of XDP frames batched */ int batched_xdp; /* an array of userspace buffers info */ struct ubuf_info_msgzc *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ struct vhost_net_ubuf_ref *ubufs; struct ptr_ring *rx_ring; struct vhost_net_buf rxq; /* Batched XDP buffs */ struct xdp_buff *xdp; }; struct vhost_net { struct vhost_dev dev; struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; /* Number of times zerocopy TX recently failed. * Protected by tx vq lock. */ unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. */ bool tx_flush; /* Private page frag */ struct page_frag page_frag; /* Refcount bias of page frag */ int refcnt_bias; }; static unsigned vhost_net_zcopy_mask __read_mostly; static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) { if (rxq->tail != rxq->head) return rxq->queue[rxq->head]; else return NULL; } static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) { return rxq->tail - rxq->head; } static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) { return rxq->tail == rxq->head; } static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) { void *ret = vhost_net_buf_get_ptr(rxq); ++rxq->head; return ret; } static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; rxq->head = 0; rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, VHOST_NET_BATCH); return rxq->tail; } static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), tun_ptr_free); rxq->head = rxq->tail = 0; } } static int vhost_net_buf_peek_len(void *ptr) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (!vhost_net_buf_is_empty(rxq)) goto out; if (!vhost_net_buf_produce(nvq)) return 0; out: return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); } static void vhost_net_buf_init(struct vhost_net_buf *rxq) { rxq->head = rxq->tail = 0; } static void vhost_net_enable_zcopy(int vq) { vhost_net_zcopy_mask |= 0x1 << vq; } static struct vhost_net_ubuf_ref * vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. */ if (!zcopy) return NULL; ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); if (!ubufs) return ERR_PTR(-ENOMEM); atomic_set(&ubufs->refcount, 1); init_waitqueue_head(&ubufs->wait); ubufs->vq = vq; return ubufs; } static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { int r = atomic_sub_return(1, &ubufs->refcount); if (unlikely(!r)) wake_up(&ubufs->wait); return r; } static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put(ubufs); wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); } static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put_and_wait(ubufs); kfree(ubufs); } static void vhost_net_clear_ubuf_info(struct vhost_net *n) { int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { kfree(n->vqs[i].ubuf_info); n->vqs[i].ubuf_info = NULL; } } static int vhost_net_set_ubuf_info(struct vhost_net *n) { bool zcopy; int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { zcopy = vhost_net_zcopy_mask & (0x1 << i); if (!zcopy) continue; n->vqs[i].ubuf_info = kmalloc_array(UIO_MAXIOV, sizeof(*n->vqs[i].ubuf_info), GFP_KERNEL); if (!n->vqs[i].ubuf_info) goto err; } return 0; err: vhost_net_clear_ubuf_info(n); return -ENOMEM; } static void vhost_net_vq_reset(struct vhost_net *n) { int i; vhost_net_clear_ubuf_info(n); for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].done_idx = 0; n->vqs[i].upend_idx = 0; n->vqs[i].ubufs = NULL; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; vhost_net_buf_init(&n->vqs[i].rxq); } } static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; if (net->tx_packets < 1024) return; net->tx_packets = 0; net->tx_zcopy_err = 0; } static void vhost_net_tx_err(struct vhost_net *net) { ++net->tx_zcopy_err; } static bool vhost_net_tx_select_zcopy(struct vhost_net *net) { /* TX flush waits for outstanding DMAs to be done. * Don't start new DMAs. */ return !net->tx_flush && net->tx_packets / 64 >= net->tx_zcopy_err; } static bool vhost_sock_zcopy(struct socket *sock) { return unlikely(experimental_zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY); } static bool vhost_sock_xdp(struct socket *sock) { return sock_flag(sock->sk, SOCK_XDP); } /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM * guest used idx. */ static void vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); int i, add; int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { vq->heads[i].len = VHOST_DMA_CLEAR_LEN; ++j; } else break; } while (j) { add = min(UIO_MAXIOV - nvq->done_idx, j); vhost_add_used_and_signal_n(vq->dev, vq, &vq->heads[nvq->done_idx], add); nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; j -= add; } } static void vhost_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; int cnt; rcu_read_lock_bh(); /* set len to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = success ? VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; cnt = vhost_net_ubuf_put(ubufs); /* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1. * We also trigger polling periodically after each 16 packets * (the value 16 here is more or less arbitrary, it's tuned to trigger * less than 10% of times). */ if (cnt <= 1 || !(cnt % 16)) vhost_poll_queue(&vq->poll); rcu_read_unlock_bh(); } static inline unsigned long busy_clock(void) { return local_clock() >> 10; } static bool vhost_can_busy_poll(unsigned long endtime) { return likely(!need_resched() && !time_after(busy_clock(), endtime) && !signal_pending(current)); } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vhost_vq_get_backend(vq)) return; vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = vhost_vq_get_backend(vq); if (!sock) return 0; return vhost_poll_start(poll, sock->file); } static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_dev *dev = vq->dev; if (!nvq->done_idx) return; vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); nvq->done_idx = 0; } static void vhost_tx_batch(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct socket *sock, struct msghdr *msghdr) { struct tun_msg_ctl ctl = { .type = TUN_MSG_PTR, .num = nvq->batched_xdp, .ptr = nvq->xdp, }; int i, err; if (nvq->batched_xdp == 0) goto signal_used; msghdr->msg_control = &ctl; msghdr->msg_controllen = sizeof(ctl); err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); /* free pages owned by XDP; since this is an unlikely error path, * keep it simple and avoid more complex bulk update for the * used pages */ for (i = 0; i < nvq->batched_xdp; ++i) put_page(virt_to_head_page(nvq->xdp[i].data)); nvq->batched_xdp = 0; nvq->done_idx = 0; return; } signal_used: vhost_net_signal_used(nvq); nvq->batched_xdp = 0; } static int sock_has_rx_data(struct socket *sock) { if (unlikely(!sock)) return 0; if (sock->ops->peek_len) return sock->ops->peek_len(sock); return skb_queue_empty(&sock->sk->sk_receive_queue); } static void vhost_net_busy_poll_try_queue(struct vhost_net *net, struct vhost_virtqueue *vq) { if (!vhost_vq_avail_empty(&net->dev, vq)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); vhost_poll_queue(&vq->poll); } } static void vhost_net_busy_poll(struct vhost_net *net, struct vhost_virtqueue *rvq, struct vhost_virtqueue *tvq, bool *busyloop_intr, bool poll_rx) { unsigned long busyloop_timeout; unsigned long endtime; struct socket *sock; struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; /* Try to hold the vq mutex of the paired virtqueue. We can't * use mutex_lock() here since we could not guarantee a * consistenet lock ordering. */ if (!mutex_trylock(&vq->mutex)) return; vhost_disable_notify(&net->dev, vq); sock = vhost_vq_get_backend(rvq); busyloop_timeout = poll_rx ? rvq->busyloop_timeout: tvq->busyloop_timeout; preempt_disable(); endtime = busy_clock() + busyloop_timeout; while (vhost_can_busy_poll(endtime)) { if (vhost_vq_has_work(vq)) { *busyloop_intr = true; break; } if ((sock_has_rx_data(sock) && !vhost_vq_avail_empty(&net->dev, rvq)) || !vhost_vq_avail_empty(&net->dev, tvq)) break; cpu_relax(); } preempt_enable(); if (poll_rx || sock_has_rx_data(sock)) vhost_net_busy_poll_try_queue(net, vq); else if (!poll_rx) /* On tx here, sock has no rx data. */ vhost_enable_notify(&net->dev, rvq); mutex_unlock(&vq->mutex); } static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, struct msghdr *msghdr, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); if (r == tvq->num && tvq->busyloop_timeout) { /* Flush batched packets first */ if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq))) vhost_tx_batch(net, tnvq, vhost_vq_get_backend(tvq), msghdr); vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); } return r; } static bool vhost_exceeds_maxpend(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); } static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, size_t hdr_size, int out) { /* Skip header. TODO: support TSO. */ size_t len = iov_length(vq->iov, out); iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len); iov_iter_advance(iter, hdr_size); return iov_iter_count(iter); } static int get_tx_bufs(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct msghdr *msg, unsigned int *out, unsigned int *in, size_t *len, bool *busyloop_intr) { struct vhost_virtqueue *vq = &nvq->vq; int ret; ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); if (ret < 0 || ret == vq->num) return ret; if (*in) { vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n", *out, *in); return -EFAULT; } /* Sanity check */ *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); if (*len == 0) { vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", *len, nvq->vhost_hlen); return -EFAULT; } return ret; } static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) { return total_len < VHOST_NET_WEIGHT && !vhost_vq_avail_empty(vq->dev, vq); } static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (pfrag->offset + sz <= pfrag->size) return true; __page_frag_cache_drain(pfrag->page, net->refcnt_bias); } pfrag->offset = 0; net->refcnt_bias = 0; if (SKB_FRAG_PAGE_ORDER) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; goto done; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; goto done; } return false; done: net->refcnt_bias = USHRT_MAX; page_ref_add(pfrag->page, USHRT_MAX - 1); return true; } #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct iov_iter *from) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); struct page_frag *alloc_frag = &net->page_frag; struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; struct tun_xdp_hdr *hdr; size_t len = iov_iter_count(from); int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); int sock_hlen = nvq->sock_hlen; void *buf; int copied; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; if (SKB_DATA_ALIGN(len + pad) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!vhost_net_page_frag_refill(net, buflen, alloc_frag, GFP_KERNEL))) return -ENOMEM; buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + offsetof(struct tun_xdp_hdr, gso), sock_hlen, from); if (copied != sock_hlen) return -EFAULT; hdr = buf; gso = &hdr->gso; if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2 > vhost16_to_cpu(vq, gso->hdr_len)) { gso->hdr_len = cpu_to_vhost16(vq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); if (vhost16_to_cpu(vq, gso->hdr_len) > len) return -EINVAL; } len -= sock_hlen; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return -EFAULT; xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len, true); hdr->buflen = buflen; --net->refcnt_bias; alloc_frag->offset += buflen; ++nvq->batched_xdp; return 0; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); do { bool busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } total_len += len; /* For simplicity, TX batching is only enabled if * sndbuf is unlimited. */ if (sock_can_batch) { err = vhost_net_build_xdp(nvq, &msg.msg_iter); if (!err) { goto done; } else if (unlikely(err != -ENOSPC)) { vhost_tx_batch(net, nvq, sock, &msg); vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } /* We can't build XDP buff, go for single * packet path but let's flush batched * packets. */ vhost_tx_batch(net, nvq, sock, &msg); msg.msg_control = NULL; } else { if (tx_can_batch(vq, total_len)) msg.msg_flags |= MSG_MORE; else msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->done_idx].len = 0; ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); vhost_tx_batch(net, nvq, sock, &msg); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *ubufs; struct ubuf_info_msgzc *ubuf; bool zcopy_used; int sent_pkts = 0; do { bool busyloop_intr; /* Release DMAs done buffers first */ vhost_zerocopy_signal_used(net, vq); busyloop_intr = false; head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } zcopy_used = len >= VHOST_GOODCOPY_LEN && !vhost_exceeds_maxpend(net) && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { ubuf = nvq->ubuf_info + nvq->upend_idx; vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; ubuf->ubuf.callback = vhost_zerocopy_callback; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; ctl.type = TUN_MSG_UBUF; ctl.ptr = &ubuf->ubuf; msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } else { msg.msg_control = NULL; ubufs = NULL; } total_len += len; if (tx_can_batch(vq, total_len) && likely(!vhost_exceeds_maxpend(net))) { msg.msg_flags |= MSG_MORE; } else { msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); if (retry) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; else vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } if (retry) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used) vhost_add_used_and_signal(&net->dev, vq, head, 0); else vhost_zerocopy_signal_used(net, vq); vhost_net_tx_packet(net); } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; struct socket *sock; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); if (vhost_sock_zcopy(sock)) handle_tx_zerocopy(net, sock); else handle_tx_copy(net, sock); out: mutex_unlock(&vq->mutex); } static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) { struct sk_buff *head; int len = 0; unsigned long flags; if (rvq->rx_ring) return vhost_net_buf_peek(rvq); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int len = peek_head_len(rnvq, sk); if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ vhost_net_signal_used(rnvq); /* Both tx vq and rx socket were polled here */ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); len = peek_head_len(rnvq, sk); } return len; } /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. * @vq - the relevant virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_virtqueue *vq, struct vring_used_elem *heads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota) { unsigned int out, in; int seg = 0; int headcount = 0; unsigned d; int r, nlogs = 0; /* len is always initialized before use since we are always called with * datalen > 0. */ u32 len; while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; } r = vhost_get_vq_desc(vq, vq->iov + seg, ARRAY_SIZE(vq->iov) - seg, &out, &in, log, log_num); if (unlikely(r < 0)) goto err; d = r; if (d == vq->num) { r = 0; goto err; } if (unlikely(out || in <= 0)) { vq_err(vq, "unexpected descriptor format for RX: " "out %d, in %d\n", out, in); r = -EINVAL; goto err; } if (unlikely(log)) { nlogs += *log_num; log += *log_num; } heads[headcount].id = cpu_to_vhost32(vq, d); len = iov_length(vq->iov + seg, in); heads[headcount].len = cpu_to_vhost32(vq, len); datalen -= len; ++headcount; seg += in; } heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); *iovcount = seg; if (unlikely(log)) *log_num = nlogs; /* Detect overrun */ if (unlikely(datalen > 0)) { r = UIO_MAXIOV + 1; goto err; } return headcount; err: vhost_discard_vq_desc(vq, headcount); return r; } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; int recv_pkts = 0; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); vhost_hlen = nvq->vhost_hlen; sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, &busyloop_intr); if (!sock_len) break; sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled. */ goto out; } busyloop_intr = false; if (nvq->rx_ring) msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1); err = sock->ops->recvmsg(sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); continue; } /* We don't need to be notified again. */ iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len); fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO. */ iov_iter_advance(&msg.msg_iter, vhost_hlen); } err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard * to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); vhost_discard_vq_desc(vq, headcount); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); goto out; } } else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF */ iov_iter_advance(&fixup, sizeof(hdr)); } /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); if (likely(mergeable) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); goto out; } nvq->done_idx += headcount; if (nvq->done_idx > VHOST_NET_BATCH) vhost_net_signal_used(nvq); if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in); total_len += vhost_len; } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); if (unlikely(busyloop_intr)) vhost_poll_queue(&vq->poll); else if (!sock_len) vhost_net_enable_vq(net, vq); out: vhost_net_signal_used(nvq); mutex_unlock(&vq->mutex); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; void **queue; struct xdp_buff *xdp; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); return -ENOMEM; } queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), GFP_KERNEL); if (!queue) { kfree(vqs); kvfree(n); return -ENOMEM; } n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); if (!xdp) { kfree(vqs); kvfree(n); kfree(queue); return -ENOMEM; } n->vqs[VHOST_NET_VQ_TX].xdp = xdp; dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].ubufs = NULL; n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; n->vqs[i].rx_ring = NULL; vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, UIO_MAXIOV + VHOST_NET_BATCH, VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true, NULL); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev, vqs[VHOST_NET_VQ_TX]); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev, vqs[VHOST_NET_VQ_RX]); f->private_data = n; n->page_frag.page = NULL; n->refcnt_bias = 0; return 0; } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); mutex_lock(&vq->mutex); sock = vhost_vq_get_backend(vq); vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, NULL); vhost_net_buf_unproduce(nvq); nvq->rx_ring = NULL; mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush(struct vhost_net *n) { vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } static int vhost_net_release(struct inode *inode, struct file *f) { struct vhost_net *n = f->private_data; struct socket *tx_sock; struct socket *rx_sock; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev); vhost_net_vq_reset(n); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ synchronize_rcu(); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); if (n->page_frag.page) __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias); kvfree(n); return 0; } static struct socket *get_raw_socket(int fd) { int r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) return ERR_PTR(-ENOTSOCK); /* Parameter checking */ if (sock->sk->sk_type != SOCK_RAW) { r = -ESOCKTNOSUPPORT; goto err; } if (sock->sk->sk_family != AF_PACKET) { r = -EPFNOSUPPORT; goto err; } return sock; err: sockfd_put(sock); return ERR_PTR(r); } static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; ring = tap_get_ptr_ring(file); if (!IS_ERR(ring)) goto out; ring = NULL; out: return ring; } static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); if (!IS_ERR(sock)) return sock; sock = tap_get_socket(file); if (IS_ERR(sock)) fput(file); return sock; } static struct socket *get_socket(int fd) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock; sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); } static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; struct vhost_net_virtqueue *nvq; struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; int r; mutex_lock(&n->dev.mutex); r = vhost_dev_check_owner(&n->dev); if (r) goto err; if (index >= VHOST_NET_VQ_MAX) { r = -ENOBUFS; goto err; } vq = &n->vqs[index].vq; nvq = &n->vqs[index]; mutex_lock(&vq->mutex); if (fd == -1) vhost_clear_msg(&n->dev); /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; goto err_vq; } sock = get_socket(fd); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; } /* start polling new socket */ oldsock = vhost_vq_get_backend(vq); if (sock != oldsock) { ubufs = vhost_net_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock)); if (IS_ERR(ubufs)) { r = PTR_ERR(ubufs); goto err_ubufs; } vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, sock); vhost_net_buf_unproduce(nvq); r = vhost_vq_init_access(vq); if (r) goto err_used; r = vhost_net_enable_vq(n, vq); if (r) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock) nvq->rx_ring = get_tap_ptr_ring(sock->file); else nvq->rx_ring = NULL; } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; n->tx_flush = false; } mutex_unlock(&vq->mutex); if (oldubufs) { vhost_net_ubuf_put_wait_and_free(oldubufs); mutex_lock(&vq->mutex); vhost_zerocopy_signal_used(n, vq); mutex_unlock(&vq->mutex); } if (oldsock) { vhost_dev_flush(&n->dev); sockfd_put(oldsock); } mutex_unlock(&n->dev.mutex); return 0; err_used: vhost_vq_set_backend(vq, oldsock); vhost_net_enable_vq(n, vq); if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: if (sock) sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_reset_owner(struct vhost_net *n) { struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; umem = vhost_dev_reset_owner_prepare(); if (!umem) { err = -ENOMEM; goto done; } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_reset_owner(&n->dev, umem); vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); return err; } static int vhost_net_set_features(struct vhost_net *n, u64 features) { size_t vhost_hlen, sock_hlen, hdr_len; int i; hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) ? sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; sock_hlen = 0; } else { /* socket provides vnet_hdr */ vhost_hlen = 0; sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&n->dev)) goto out_unlock; if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&n->dev)) goto out_unlock; } for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vq.acked_features = features; n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); } mutex_unlock(&n->dev.mutex); return 0; out_unlock: mutex_unlock(&n->dev.mutex); return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) { int r; mutex_lock(&n->dev.mutex); if (vhost_dev_has_owner(&n->dev)) { r = -EBUSY; goto out; } r = vhost_net_set_ubuf_info(n); if (r) goto out; r = vhost_dev_set_owner(&n->dev); if (r) vhost_net_clear_ubuf_info(n); vhost_net_flush(n); out: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; u64 features; int r; switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: features = VHOST_NET_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~VHOST_NET_FEATURES) return -EOPNOTSUPP; return vhost_net_set_features(n, features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~VHOST_NET_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&n->dev, features); return 0; case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); case VHOST_SET_OWNER: return vhost_net_set_owner(n); default: mutex_lock(&n->dev.mutex); r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return r; } } static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) { struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .read_iter = vhost_net_chr_read_iter, .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; static struct miscdevice vhost_net_misc = { .minor = VHOST_NET_MINOR, .name = "vhost-net", .fops = &vhost_net_fops, }; static int __init vhost_net_init(void) { if (experimental_zcopytx) vhost_net_enable_zcopy(VHOST_NET_VQ_TX); return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void __exit vhost_net_exit(void) { misc_deregister(&vhost_net_misc); } module_exit(vhost_net_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); MODULE_ALIAS("devname:vhost-net");
2 2 2 1 2 2 2 1 2 2 4 2 1 1 3 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 // SPDX-License-Identifier: GPL-2.0-or-later /* * CALIPSO - Common Architecture Label IPv6 Security Option * * This is an implementation of the CALIPSO protocol as specified in * RFC 5570. * * Authors: Paul Moore <paul.moore@hp.com> * Huw Davies <huw@codeweavers.com> */ /* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/calipso.h> #include <linux/atomic.h> #include <linux/bug.h> #include <asm/unaligned.h> #include <linux/crc-ccitt.h> /* Maximium size of the calipso option including * the two-byte TLV header. */ #define CALIPSO_OPT_LEN_MAX (2 + 252) /* Size of the minimum calipso option including * the two-byte TLV header. */ #define CALIPSO_HDR_LEN (2 + 8) /* Maximium size of the calipso option including * the two-byte TLV header and upto 3 bytes of * leading pad and 7 bytes of trailing pad. */ #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) /* Maximium size of u32 aligned buffer required to hold calipso * option. Max of 3 initial pad bytes starting from buffer + 3. * i.e. the worst case is when the previous tlv finishes on 4n + 3. */ #define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX) /* List of available DOI definitions */ static DEFINE_SPINLOCK(calipso_doi_list_lock); static LIST_HEAD(calipso_doi_list); /* Label mapping cache */ int calipso_cache_enabled = 1; int calipso_cache_bucketsize = 10; #define CALIPSO_CACHE_BUCKETBITS 7 #define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS) #define CALIPSO_CACHE_REORDERLIMIT 10 struct calipso_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct calipso_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct calipso_map_cache_bkt *calipso_cache; static void calipso_cache_invalidate(void); static void calipso_doi_putdef(struct calipso_doi *doi_def); /* Label Mapping Cache Functions */ /** * calipso_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * calipso_map_cache_hash - Hashing function for the CALIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CALIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /** * calipso_cache_init - Initialize the CALIPSO cache * * Description: * Initializes the CALIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init calipso_cache_init(void) { u32 iter; calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, sizeof(struct calipso_map_cache_bkt), GFP_KERNEL); if (!calipso_cache) return -ENOMEM; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_init(&calipso_cache[iter].lock); calipso_cache[iter].size = 0; INIT_LIST_HEAD(&calipso_cache[iter].list); } return 0; } /** * calipso_cache_invalidate - Invalidates the current CALIPSO cache * * Description: * Invalidates and frees any entries in the CALIPSO cache. Returns zero on * success and negative values on failure. * */ static void calipso_cache_invalidate(void) { struct calipso_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_bh(&calipso_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &calipso_cache[iter].list, list) { list_del(&entry->list); calipso_cache_entry_free(entry); } calipso_cache[iter].size = 0; spin_unlock_bh(&calipso_cache[iter].lock); } } /** * calipso_cache_check - Check the CALIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. * */ static int calipso_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct calipso_map_cache_entry *entry; struct calipso_map_cache_entry *prev_entry = NULL; u32 hash; if (!calipso_cache_enabled) return -ENOENT; hash = calipso_map_cache_hash(key, key_len); bkt = hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); list_for_each_entry(entry, &calipso_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CALIPSO; if (!prev_entry) { spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CALIPSO_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&calipso_cache[bkt].lock); return -ENOENT; } /** * calipso_cache_add - Add an entry to the CALIPSO cache * @calipso_ptr: the CALIPSO option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CALIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. The key stored starts at calipso_ptr + 2, * i.e. the type and length bytes are not stored, this corresponds to * calipso_ptr[1] bytes of data. * */ static int calipso_cache_add(const unsigned char *calipso_ptr, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; u32 bkt; struct calipso_map_cache_entry *entry = NULL; struct calipso_map_cache_entry *old_entry = NULL; u32 calipso_ptr_len; if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0) return 0; calipso_ptr_len = calipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = calipso_ptr_len; entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); if (calipso_cache[bkt].size < calipso_cache_bucketsize) { list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache[bkt].size += 1; } else { old_entry = list_entry(calipso_cache[bkt].list.prev, struct calipso_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache_entry_free(old_entry); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; cache_add_failure: if (entry) calipso_cache_entry_free(entry); return ret_val; } /* DOI List Functions */ /** * calipso_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct calipso_doi *calipso_doi_search(u32 doi) { struct calipso_doi *iter; list_for_each_entry_rcu(iter, &calipso_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CALIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see calipso.h for details). Returns * zero on success and non-zero on failure. * */ static int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CALIPSO_DOI_UNKNOWN) goto doi_add_return; refcount_set(&doi_def->refcount, 1); spin_lock(&calipso_doi_list_lock); if (calipso_doi_search(doi_def->doi)) { spin_unlock(&calipso_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &calipso_doi_list); spin_unlock(&calipso_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CALIPSO_MAP_PASS: type_str = "pass"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " calipso_doi=%u calipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ static void calipso_doi_free(struct calipso_doi *doi_def) { kfree(doi_def); } /** * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void calipso_doi_free_rcu(struct rcu_head *entry) { struct calipso_doi *doi_def; doi_def = container_of(entry, struct calipso_doi, rcu); calipso_doi_free(doi_def); } /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct calipso_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&calipso_doi_list_lock); doi_def = calipso_doi_search(doi); if (!doi_def) { spin_unlock(&calipso_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&calipso_doi_list_lock); calipso_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " calipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * calipso_doi_putdef() is called when the caller is done. * */ static struct calipso_doi *calipso_doi_getdef(u32 doi) { struct calipso_doi *doi_def; rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * calipso_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from calipso_doi_getdef(). * */ static void calipso_doi_putdef(struct calipso_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; calipso_cache_invalidate(); call_rcu(&doi_def->rcu, calipso_doi_free_rcu); } /** * calipso_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ static int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct calipso_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /** * calipso_validate - Validate a CALIPSO option * @skb: the packet * @option: the start of the option * * Description: * This routine is called to validate a CALIPSO option. * If the option is valid then %true is returned, otherwise * %false is returned. * * The caller should have already checked that the length of the * option (including the TLV header) is >= 10 and that the catmap * length is consistent with the option length. * * We leave checks on the level and categories to the socket layer. */ bool calipso_validate(const struct sk_buff *skb, const unsigned char *option) { struct calipso_doi *doi_def; bool ret_val; u16 crc, len = option[1] + 2; static const u8 zero[2]; /* The original CRC runs over the option including the TLV header * with the CRC-16 field (at offset 8) zeroed out. */ crc = crc_ccitt(0xffff, option, 8); crc = crc_ccitt(crc, zero, sizeof(zero)); if (len > 10) crc = crc_ccitt(crc, option + 10, len - 10); crc = ~crc; if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff)) return false; rcu_read_lock(); doi_def = calipso_doi_search(get_unaligned_be32(option + 2)); ret_val = !!doi_def; rcu_read_unlock(); return ret_val; } /** * calipso_map_cat_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CALIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. * */ static int calipso_map_cat_hton(const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int spot = -1; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_catmap_walk(secattr->attr.mls.cat, spot + 1); if (spot < 0) break; if (spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, spot, 1); if (spot > net_spot_max) net_spot_max = spot; } return (net_spot_max / 32 + 1) * 4; } /** * calipso_map_cat_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CALIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int spot = -1; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_bitmap_walk(net_cat, net_clen_bits, spot + 1, 1); if (spot < 0) { if (spot == -2) return -EFAULT; return 0; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * calipso_pad_write - Writes pad bytes in TLV format * @buf: the buffer * @offset: offset from start of buffer to write padding * @count: number of pad bytes to write * * Description: * Write @count bytes of TLV padding into @buffer starting at offset @offset. * @count should be less than 8 - see RFC 4942. * */ static int calipso_pad_write(unsigned char *buf, unsigned int offset, unsigned int count) { if (WARN_ON_ONCE(count >= 8)) return -EINVAL; switch (count) { case 0: break; case 1: buf[offset] = IPV6_TLV_PAD1; break; default: buf[offset] = IPV6_TLV_PADN; buf[offset + 1] = count - 2; if (count > 2) memset(buf + offset + 2, 0, count - 2); break; } return 0; } /** * calipso_genopt - Generate a CALIPSO option * @buf: the option buffer * @start: offset from which to write * @buf_len: the size of opt_buf * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CALIPSO option using the DOI definition and security attributes * passed to the function. This also generates upto three bytes of leading * padding that ensures that the option is 4n + 2 aligned. It returns the * number of bytes written (including any initial padding). */ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 len, pad; u16 crc; static const unsigned char padding[4] = {2, 1, 0, 3}; unsigned char *calipso; /* CALIPSO has 4n + 2 alignment */ pad = padding[start & 3]; if (buf_len <= start + pad + CALIPSO_HDR_LEN) return -ENOSPC; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; len = CALIPSO_HDR_LEN; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = calipso_map_cat_hton(doi_def, secattr, buf + start + pad + len, buf_len - start - pad - len); if (ret_val < 0) return ret_val; len += ret_val; } calipso_pad_write(buf, start, pad); calipso = buf + start + pad; calipso[0] = IPV6_TLV_CALIPSO; calipso[1] = len - 2; *(__be32 *)(calipso + 2) = htonl(doi_def->doi); calipso[6] = (len - CALIPSO_HDR_LEN) / 4; calipso[7] = secattr->attr.mls.lvl; crc = ~crc_ccitt(0xffff, calipso, len); calipso[8] = crc & 0xff; calipso[9] = (crc >> 8) & 0xff; return pad + len; } /* Hop-by-hop hdr helper functions */ /** * calipso_opt_update - Replaces socket's hop options with a new set * @sk: the socket * @hop: new hop options * * Description: * Replaces @sk's hop options with @hop. @hop may be NULL to leave * the socket with no hop options. * */ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = ipv6_update_options(sk, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_tlv_len - Returns the length of the TLV * @opt: the option header * @offset: offset of the TLV within the header * * Description: * Returns the length of the TLV option at offset @offset within * the option header @opt. Checks that the entire TLV fits inside * the option header, returns a negative value if this is not the case. */ static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) { unsigned char *tlv = (unsigned char *)opt; unsigned int opt_len = ipv6_optlen(opt), tlv_len; if (offset < sizeof(*opt) || offset >= opt_len) return -EINVAL; if (tlv[offset] == IPV6_TLV_PAD1) return 1; if (offset + 1 >= opt_len) return -EINVAL; tlv_len = tlv[offset + 1] + 2; if (offset + tlv_len > opt_len) return -EINVAL; return tlv_len; } /** * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header * @hop: the hop options header * @start: on return holds the offset of any leading padding * @end: on return holds the offset of the first non-pad TLV after CALIPSO * * Description: * Finds the space occupied by a CALIPSO option (including any leading and * trailing padding). * * If a CALIPSO option exists set @start and @end to the * offsets within @hop of the start of padding before the first * CALIPSO option and the end of padding after the first CALIPSO * option. In this case the function returns 0. * * In the absence of a CALIPSO option, @start and @end will be * set to the start and end of any trailing padding in the header. * This is useful when appending a new option, as the caller may want * to overwrite some of this padding. In this case the function will * return -ENOENT. */ static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start, unsigned int *end) { int ret_val = -ENOENT, tlv_len; unsigned int opt_len, offset, offset_s = 0, offset_e = 0; unsigned char *opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { tlv_len = calipso_tlv_len(hop, offset); if (tlv_len < 0) return tlv_len; switch (opt[offset]) { case IPV6_TLV_PAD1: case IPV6_TLV_PADN: if (offset_e) offset_e = offset; break; case IPV6_TLV_CALIPSO: ret_val = 0; offset_e = offset; break; default: if (offset_e == 0) offset_s = offset; else goto out; } offset += tlv_len; } out: if (offset_s) *start = offset_s + calipso_tlv_len(hop, offset_s); else *start = sizeof(*hop); if (offset_e) *end = offset_e + calipso_tlv_len(hop, offset_e); else *end = opt_len; return ret_val; } /** * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr * @hop: the original hop options header * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Creates a new hop options header based on @hop with a * CALIPSO option added to it. If @hop already contains a CALIPSO * option this is overwritten, otherwise the new option is appended * after any existing options. If @hop is NULL then the new header * will contain just the CALIPSO option and any needed padding. * */ static struct ipv6_opt_hdr * calipso_opt_insert(struct ipv6_opt_hdr *hop, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { unsigned int start, end, buf_len, pad, hop_len; struct ipv6_opt_hdr *new; int ret_val; if (hop) { hop_len = ipv6_optlen(hop); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ERR_PTR(ret_val); } else { hop_len = 0; start = sizeof(*hop); end = 0; } buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD; new = kzalloc(buf_len, GFP_ATOMIC); if (!new) return ERR_PTR(-ENOMEM); if (start > sizeof(*hop)) memcpy(new, hop, start); ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def, secattr); if (ret_val < 0) { kfree(new); return ERR_PTR(ret_val); } buf_len = start + ret_val; /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */ pad = ((buf_len & 4) + (end & 7)) & 7; calipso_pad_write((unsigned char *)new, buf_len, pad); buf_len += pad; if (end != hop_len) { memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end); buf_len += hop_len - end; } new->nexthdr = 0; new->hdrlen = buf_len / 8 - 1; return new; } /** * calipso_opt_del - Removes the CALIPSO option from an option header * @hop: the original header * @new: the new header * * Description: * Creates a new header based on @hop without any CALIPSO option. If @hop * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains * no other non-padding options, it returns zero with @new set to NULL. * Otherwise it returns zero, creates a new header without the CALIPSO * option (and removing as much padding as possible) and returns with * @new set to that header. * */ static int calipso_opt_del(struct ipv6_opt_hdr *hop, struct ipv6_opt_hdr **new) { int ret_val; unsigned int start, end, delta, pad, hop_len; ret_val = calipso_opt_find(hop, &start, &end); if (ret_val) return ret_val; hop_len = ipv6_optlen(hop); if (start == sizeof(*hop) && end == hop_len) { /* There's no other option in the header so return NULL */ *new = NULL; return 0; } delta = (end - start) & ~7; *new = kzalloc(hop_len - delta, GFP_ATOMIC); if (!*new) return -ENOMEM; memcpy(*new, hop, start); (*new)->hdrlen -= delta / 8; pad = (end - start) & 7; calipso_pad_write((unsigned char *)*new, start, pad); if (end != hop_len) memcpy((char *)*new + start + pad, (char *)hop + end, hop_len - end); return 0; } /** * calipso_opt_getattr - Get the security attributes from a memory block * @calipso: the CALIPSO option * @secattr: the security attributes * * Description: * Inspect @calipso and return the security attributes in @secattr. * Returns zero on success and negative values on failure. * */ static int calipso_opt_getattr(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi, len = calipso[1], cat_len = calipso[6] * 4; struct calipso_doi *doi_def; if (cat_len + 8 > len) return -EINVAL; if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(calipso + 2); rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto getattr_return; secattr->attr.mls.lvl = calipso[7]; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (cat_len) { ret_val = calipso_map_cat_ntoh(doi_def, calipso + 10, cat_len, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); goto getattr_return; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } secattr->type = NETLBL_NLTYPE_CALIPSO; getattr_return: rcu_read_unlock(); return ret_val; } /* sock functions. */ /** * calipso_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CALIPSO option attached to the sock and if * there is return the CALIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ static int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; hop = txopts->hopopt; opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { len = calipso_tlv_len(hop, offset); if (len < 0) { ret_val = len; goto done; } switch (opt[offset]) { case IPV6_TLV_CALIPSO: if (len < CALIPSO_HDR_LEN) ret_val = -EINVAL; else ret_val = calipso_opt_getattr(&opt[offset], secattr); goto done; default: offset += len; break; } } done: txopt_put(txopts); return ret_val; } /** * calipso_sock_setattr - Add a CALIPSO option to a socket * @sk: the socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ static int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6_opt_hdr *old, *new; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); old = NULL; if (txopts) old = txopts->hopopt; new = calipso_opt_insert(old, doi_def, secattr); txopt_put(txopts); if (IS_ERR(new)) return PTR_ERR(new); ret_val = calipso_opt_update(sk, new); kfree(new); return ret_val; } /** * calipso_sock_delattr - Delete the CALIPSO option from a socket * @sk: the socket * * Description: * Removes the CALIPSO option from a socket, if present. * */ static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; if (calipso_opt_del(txopts->hopopt, &new_hop)) goto done; calipso_opt_update(sk, new_hop); kfree(new_hop); done: txopt_put(txopts); } /* request sock functions. */ /** * calipso_req_setattr - Add a CALIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ static int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { struct ipv6_txoptions *txopts; struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else old = NULL; new = calipso_opt_insert(old, doi_def, secattr); if (IS_ERR(new)) return PTR_ERR(new); txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_req_delattr - Delete the CALIPSO option from a request socket * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. * */ static void calipso_req_delattr(struct request_sock *req) { struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *new; struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } } kfree(new); } /* skbuff functions. */ /** * calipso_skbuff_optptr - Find the CALIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer * to the start of the CALIPSO option on success, NULL if one if not found. * */ static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb) { const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb); int offset; if (ip6_hdr->nexthdr != NEXTHDR_HOP) return NULL; offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO); if (offset >= 0) return (unsigned char *)ip6_hdr + offset; return NULL; } /** * calipso_skbuff_setattr - Set the CALIPSO option on a packet * @skb: the packet * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Set the CALIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ static int calipso_skbuff_setattr(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *hop; unsigned char buf[CALIPSO_MAX_BUFFER]; int len_delta, new_end, pad, payload; unsigned int start, end; ip6_hdr = ipv6_hdr(skb); if (ip6_hdr->nexthdr == NEXTHDR_HOP) { hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ret_val; } else { start = 0; end = 0; } memset(buf, 0, sizeof(buf)); ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr); if (ret_val < 0) return ret_val; new_end = start + ret_val; /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ pad = ((new_end & 4) + (end & 7)) & 7; len_delta = new_end - (int)end + pad; ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */ if (len_delta) { if (len_delta > 0) skb_push(skb, len_delta); else skb_pull(skb, -len_delta); memmove((char *)ip6_hdr - len_delta, ip6_hdr, sizeof(*ip6_hdr) + start); skb_reset_network_header(skb); ip6_hdr = ipv6_hdr(skb); payload = ntohs(ip6_hdr->payload_len); ip6_hdr->payload_len = htons(payload + len_delta); } hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); if (start == 0) { struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf; new_hop->nexthdr = ip6_hdr->nexthdr; new_hop->hdrlen = len_delta / 8 - 1; ip6_hdr->nexthdr = NEXTHDR_HOP; } else { hop->hdrlen += len_delta / 8; } memcpy((char *)hop + start, buf + (start & 3), new_end - start); calipso_pad_write((unsigned char *)hop, new_end, pad); return 0; } /** * calipso_skbuff_delattr - Delete any CALIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CALIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ static int calipso_skbuff_delattr(struct sk_buff *skb) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *old_hop; u32 old_hop_len, start = 0, end = 0, delta, size, pad; if (!calipso_skbuff_optptr(skb)) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); old_hop_len = ipv6_optlen(old_hop); ret_val = calipso_opt_find(old_hop, &start, &end); if (ret_val) return ret_val; if (start == sizeof(*old_hop) && end == old_hop_len) { /* There's no other option in the header so we delete * the whole thing. */ delta = old_hop_len; size = sizeof(*ip6_hdr); ip6_hdr->nexthdr = old_hop->nexthdr; } else { delta = (end - start) & ~7; if (delta) old_hop->hdrlen -= delta / 8; pad = (end - start) & 7; size = sizeof(*ip6_hdr) + start + pad; calipso_pad_write((unsigned char *)old_hop, start, pad); } if (delta) { skb_pull(skb, delta); memmove((char *)ip6_hdr + delta, ip6_hdr, size); skb_reset_network_header(skb); } return 0; } static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, .doi_remove = calipso_doi_remove, .doi_getdef = calipso_doi_getdef, .doi_putdef = calipso_doi_putdef, .doi_walk = calipso_doi_walk, .sock_getattr = calipso_sock_getattr, .sock_setattr = calipso_sock_setattr, .sock_delattr = calipso_sock_delattr, .req_setattr = calipso_req_setattr, .req_delattr = calipso_req_delattr, .opt_getattr = calipso_opt_getattr, .skbuff_optptr = calipso_skbuff_optptr, .skbuff_setattr = calipso_skbuff_setattr, .skbuff_delattr = calipso_skbuff_delattr, .cache_invalidate = calipso_cache_invalidate, .cache_add = calipso_cache_add }; /** * calipso_init - Initialize the CALIPSO module * * Description: * Initialize the CALIPSO module and prepare it for use. Returns zero on * success and negative values on failure. * */ int __init calipso_init(void) { int ret_val; ret_val = calipso_cache_init(); if (!ret_val) netlbl_calipso_ops_register(&ops); return ret_val; } void calipso_exit(void) { netlbl_calipso_ops_register(NULL); calipso_cache_invalidate(); kfree(calipso_cache); }
5 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 /* SPDX-License-Identifier: GPL-2.0-only */ /* L2TP internal definitions. * * Copyright (c) 2008,2009 Katalix Systems Ltd */ #include <linux/refcount.h> #ifndef _L2TP_CORE_H_ #define _L2TP_CORE_H_ #include <net/dst.h> #include <net/sock.h> #ifdef CONFIG_XFRM #include <net/xfrm.h> #endif /* Random numbers used for internal consistency checks of tunnel and session structures */ #define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D /* Per tunnel session hash table size */ #define L2TP_HASH_BITS 4 #define L2TP_HASH_SIZE BIT(L2TP_HASH_BITS) /* System-wide session hash table size */ #define L2TP_HASH_BITS_2 8 #define L2TP_HASH_SIZE_2 BIT(L2TP_HASH_BITS_2) struct sk_buff; struct l2tp_stats { atomic_long_t tx_packets; atomic_long_t tx_bytes; atomic_long_t tx_errors; atomic_long_t rx_packets; atomic_long_t rx_bytes; atomic_long_t rx_seq_discards; atomic_long_t rx_oos_packets; atomic_long_t rx_errors; atomic_long_t rx_cookie_discards; atomic_long_t rx_invalid; }; struct l2tp_tunnel; /* L2TP session configuration */ struct l2tp_session_cfg { enum l2tp_pwtype pw_type; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ u8 peer_cookie[8]; /* peer's cookie */ int peer_cookie_len; /* 0, 4 or 8 bytes */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ char *ifname; }; /* Represents a session (pseudowire) instance. * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics. * Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into * an additional per-net ("global") hashlist. */ #define L2TP_SESSION_NAME_MAX 32 struct l2tp_session { int magic; /* should be L2TP_SESSION_MAGIC */ long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */ u32 session_id; u32 peer_session_id; u8 cookie[8]; int cookie_len; u8 peer_cookie[8]; int peer_cookie_len; u16 l2specific_type; u16 hdr_len; u32 nr; /* session NR state (receive) */ u32 ns; /* session NR state (send) */ struct sk_buff_head reorder_q; /* receive reorder queue */ u32 nr_max; /* max NR. Depends on tunnel */ u32 nr_window_size; /* NR window size */ u32 nr_oos; /* NR of last OOS packet */ int nr_oos_count; /* for OOS recovery */ int nr_oos_count_max; struct hlist_node hlist; /* hash list node */ refcount_t ref_count; char name[L2TP_SESSION_NAME_MAX]; /* for logging */ char ifname[IFNAMSIZ]; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; struct hlist_node global_hlist; /* global hash list node */ /* Session receive handler for data packets. * Each pseudowire implementation should implement this callback in order to * handle incoming packets. Packets are passed to the pseudowire handler after * reordering, if data sequence numbers are enabled for the session. */ void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); /* Session close handler. * Each pseudowire implementation may implement this callback in order to carry * out pseudowire-specific shutdown actions. * The callback is called by core after unhashing the session and purging its * reorder queue. */ void (*session_close)(struct l2tp_session *session); /* Session show handler. * Pseudowire-specific implementation of debugfs session rendering. * The callback is called by l2tp_debugfs.c after rendering core session * information. */ void (*show)(struct seq_file *m, void *priv); u8 priv[]; /* private data */ }; /* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ struct in_addr local_ip; struct in_addr peer_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr *local_ip6; struct in6_addr *peer_ip6; #endif u16 local_udp_port; u16 peer_udp_port; unsigned int use_udp_checksums:1, udp6_zero_tx_checksums:1, udp6_zero_rx_checksums:1; }; /* Represents a tunnel instance. * Tracks runtime state including IO statistics. * Holds the tunnel socket (either passed from userspace or directly created by the kernel). * Maintains a hashlist of sessions belonging to the tunnel instance. * Is linked into a per-net list of tunnels. */ #define L2TP_TUNNEL_NAME_MAX 20 struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ unsigned long dead; struct rcu_head rcu; spinlock_t hlist_lock; /* write-protection for session_hlist */ bool acpt_newsess; /* indicates whether this tunnel accepts * new sessions. Protected by hlist_lock. */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; /* hashed list of sessions, hashed by id */ u32 tunnel_id; u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */ enum l2tp_encap_type encap; struct l2tp_stats stats; struct list_head list; /* list node on per-namespace list of tunnels */ struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; void (*old_sk_destruct)(struct sock *sk); struct sock *sock; /* parent socket */ int fd; /* parent fd, if tunnel socket was created * by userspace */ struct work_struct del_work; }; /* Pseudowire ops callbacks for use with the l2tp genetlink interface */ struct l2tp_nl_cmd_ops { /* The pseudowire session create callback is responsible for creating a session * instance for a specific pseudowire type. * It must call l2tp_session_create and l2tp_session_register to register the * session instance, as well as carry out any pseudowire-specific initialisation. * It must return >= 0 on success, or an appropriate negative errno value on failure. */ int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); /* The pseudowire session delete callback is responsible for initiating the deletion * of a session instance. * It must call l2tp_session_delete, as well as carry out any pseudowire-specific * teardown actions. */ void (*session_delete)(struct l2tp_session *session); }; static inline void *l2tp_session_priv(struct l2tp_session *session) { return &session->priv[0]; } /* Tunnel and session refcounts */ void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel); void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel); void l2tp_session_inc_refcount(struct l2tp_session *session); void l2tp_session_dec_refcount(struct l2tp_session *session); /* Tunnel and session lookup. * These functions take a reference on the instances they return, so * the caller must ensure that the reference is dropped appropriately. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, u32 session_id); struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); /* Tunnel and session lifetime management. * Creation of a new instance is a two-step process: create, then register. * Destruction is triggered using the *_delete functions, and completes asynchronously. */ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel); void l2tp_session_delete(struct l2tp_session *session); /* Receive path helpers. If data sequencing is enabled for the session these * functions handle queuing and reordering prior to passing packets to the * pseudowire code to be passed to userspace. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); /* Transmit path helpers for sending packets over the tunnel socket. */ void l2tp_session_set_header_len(struct l2tp_session *session, int version); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb); /* Pseudowire management. * Pseudowires should register with l2tp core on module init, and unregister * on module exit. */ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ int l2tp_ioctl(struct sock *sk, int cmd, int *karg); /* Extract the tunnel structure from a socket's sk_user_data pointer, * validating the tunnel magic feather. */ struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk); static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { switch (session->l2specific_type) { case L2TP_L2SPECTYPE_DEFAULT: return 4; case L2TP_L2SPECTYPE_NONE: default: return 0; } } static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel) { struct dst_entry *dst; u32 mtu; dst = sk_dst_get(tunnel->sock); if (!dst) return 0; mtu = dst_mtu(dst); dst_release(dst); return mtu; } #ifdef CONFIG_XFRM static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; return sk && (rcu_access_pointer(sk->sk_policy[0]) || rcu_access_pointer(sk->sk_policy[1])); } #else static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { return false; } #endif static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, struct sk_buff *skb, unsigned char **ptr, unsigned char **optr) { int opt_len = session->peer_cookie_len + l2tp_get_l2specific_len(session); if (opt_len > 0) { int off = *ptr - *optr; if (!pskb_may_pull(skb, off + opt_len)) return -1; if (skb->data != *optr) { *optr = skb->data; *ptr = skb->data + off; } } return 0; } #define MODULE_ALIAS_L2TP_PWTYPE(type) \ MODULE_ALIAS("net-l2tp-type-" __stringify(type)) #endif /* _L2TP_CORE_H_ */
11 11 9 6 4 12 12 12 12 12 11 11 2 3 3 1 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "queueing.h" #include "device.h" #include "peer.h" #include "timers.h" #include "messages.h" #include "cookie.h" #include "socket.h" #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/udp.h> #include <net/ip_tunnels.h> /* Must be called with bh disabled. */ static void update_rx_stats(struct wg_peer *peer, size_t len) { dev_sw_netstats_rx_add(peer->device->dev, len); peer->rx_bytes += len; } #define SKB_TYPE_LE32(skb) (((struct message_header *)(skb)->data)->type) static size_t validate_header_len(struct sk_buff *skb) { if (unlikely(skb->len < sizeof(struct message_header))) return 0; if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_DATA) && skb->len >= MESSAGE_MINIMUM_LENGTH) return sizeof(struct message_data); if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION) && skb->len == sizeof(struct message_handshake_initiation)) return sizeof(struct message_handshake_initiation); if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE) && skb->len == sizeof(struct message_handshake_response)) return sizeof(struct message_handshake_response); if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE) && skb->len == sizeof(struct message_handshake_cookie)) return sizeof(struct message_handshake_cookie); return 0; } static int prepare_skb_header(struct sk_buff *skb, struct wg_device *wg) { size_t data_offset, data_len, header_len; struct udphdr *udp; if (unlikely(!wg_check_packet_protocol(skb) || skb_transport_header(skb) < skb->head || (skb_transport_header(skb) + sizeof(struct udphdr)) > skb_tail_pointer(skb))) return -EINVAL; /* Bogus IP header */ udp = udp_hdr(skb); data_offset = (u8 *)udp - skb->data; if (unlikely(data_offset > U16_MAX || data_offset + sizeof(struct udphdr) > skb->len)) /* Packet has offset at impossible location or isn't big enough * to have UDP fields. */ return -EINVAL; data_len = ntohs(udp->len); if (unlikely(data_len < sizeof(struct udphdr) || data_len > skb->len - data_offset)) /* UDP packet is reporting too small of a size or lying about * its size. */ return -EINVAL; data_len -= sizeof(struct udphdr); data_offset = (u8 *)udp + sizeof(struct udphdr) - skb->data; if (unlikely(!pskb_may_pull(skb, data_offset + sizeof(struct message_header)) || pskb_trim(skb, data_len + data_offset) < 0)) return -EINVAL; skb_pull(skb, data_offset); if (unlikely(skb->len != data_len)) /* Final len does not agree with calculated len */ return -EINVAL; header_len = validate_header_len(skb); if (unlikely(!header_len)) return -EINVAL; __skb_push(skb, data_offset); if (unlikely(!pskb_may_pull(skb, data_offset + header_len))) return -EINVAL; __skb_pull(skb, data_offset); return 0; } static void wg_receive_handshake_packet(struct wg_device *wg, struct sk_buff *skb) { enum cookie_mac_state mac_state; struct wg_peer *peer = NULL; /* This is global, so that our load calculation applies to the whole * system. We don't care about races with it at all. */ static u64 last_under_load; bool packet_needs_cookie; bool under_load; if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE)) { net_dbg_skb_ratelimited("%s: Receiving cookie response from %pISpfsc\n", wg->dev->name, skb); wg_cookie_message_consume( (struct message_handshake_cookie *)skb->data, wg); return; } under_load = atomic_read(&wg->handshake_queue_len) >= MAX_QUEUED_INCOMING_HANDSHAKES / 8; if (under_load) { last_under_load = ktime_get_coarse_boottime_ns(); } else if (last_under_load) { under_load = !wg_birthdate_has_expired(last_under_load, 1); if (!under_load) last_under_load = 0; } mac_state = wg_cookie_validate_packet(&wg->cookie_checker, skb, under_load); if ((under_load && mac_state == VALID_MAC_WITH_COOKIE) || (!under_load && mac_state == VALID_MAC_BUT_NO_COOKIE)) { packet_needs_cookie = false; } else if (under_load && mac_state == VALID_MAC_BUT_NO_COOKIE) { packet_needs_cookie = true; } else { net_dbg_skb_ratelimited("%s: Invalid MAC of handshake, dropping packet from %pISpfsc\n", wg->dev->name, skb); return; } switch (SKB_TYPE_LE32(skb)) { case cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION): { struct message_handshake_initiation *message = (struct message_handshake_initiation *)skb->data; if (packet_needs_cookie) { wg_packet_send_handshake_cookie(wg, skb, message->sender_index); return; } peer = wg_noise_handshake_consume_initiation(message, wg); if (unlikely(!peer)) { net_dbg_skb_ratelimited("%s: Invalid handshake initiation from %pISpfsc\n", wg->dev->name, skb); return; } wg_socket_set_peer_endpoint_from_skb(peer, skb); net_dbg_ratelimited("%s: Receiving handshake initiation from peer %llu (%pISpfsc)\n", wg->dev->name, peer->internal_id, &peer->endpoint.addr); wg_packet_send_handshake_response(peer); break; } case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE): { struct message_handshake_response *message = (struct message_handshake_response *)skb->data; if (packet_needs_cookie) { wg_packet_send_handshake_cookie(wg, skb, message->sender_index); return; } peer = wg_noise_handshake_consume_response(message, wg); if (unlikely(!peer)) { net_dbg_skb_ratelimited("%s: Invalid handshake response from %pISpfsc\n", wg->dev->name, skb); return; } wg_socket_set_peer_endpoint_from_skb(peer, skb); net_dbg_ratelimited("%s: Receiving handshake response from peer %llu (%pISpfsc)\n", wg->dev->name, peer->internal_id, &peer->endpoint.addr); if (wg_noise_handshake_begin_session(&peer->handshake, &peer->keypairs)) { wg_timers_session_derived(peer); wg_timers_handshake_complete(peer); /* Calling this function will either send any existing * packets in the queue and not send a keepalive, which * is the best case, Or, if there's nothing in the * queue, it will send a keepalive, in order to give * immediate confirmation of the session. */ wg_packet_send_keepalive(peer); } break; } } if (unlikely(!peer)) { WARN(1, "Somehow a wrong type of packet wound up in the handshake queue!\n"); return; } local_bh_disable(); update_rx_stats(peer, skb->len); local_bh_enable(); wg_timers_any_authenticated_packet_received(peer); wg_timers_any_authenticated_packet_traversal(peer); wg_peer_put(peer); } void wg_packet_handshake_receive_worker(struct work_struct *work) { struct crypt_queue *queue = container_of(work, struct multicore_worker, work)->ptr; struct wg_device *wg = container_of(queue, struct wg_device, handshake_queue); struct sk_buff *skb; while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) { wg_receive_handshake_packet(wg, skb); dev_kfree_skb(skb); atomic_dec(&wg->handshake_queue_len); cond_resched(); } } static void keep_key_fresh(struct wg_peer *peer) { struct noise_keypair *keypair; bool send; if (peer->sent_lastminute_handshake) return; rcu_read_lock_bh(); keypair = rcu_dereference_bh(peer->keypairs.current_keypair); send = keypair && READ_ONCE(keypair->sending.is_valid) && keypair->i_am_the_initiator && wg_birthdate_has_expired(keypair->sending.birthdate, REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT); rcu_read_unlock_bh(); if (unlikely(send)) { peer->sent_lastminute_handshake = true; wg_packet_send_queued_handshake_initiation(peer, false); } } static bool decrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair) { struct scatterlist sg[MAX_SKB_FRAGS + 8]; struct sk_buff *trailer; unsigned int offset; int num_frags; if (unlikely(!keypair)) return false; if (unlikely(!READ_ONCE(keypair->receiving.is_valid) || wg_birthdate_has_expired(keypair->receiving.birthdate, REJECT_AFTER_TIME) || keypair->receiving_counter.counter >= REJECT_AFTER_MESSAGES)) { WRITE_ONCE(keypair->receiving.is_valid, false); return false; } PACKET_CB(skb)->nonce = le64_to_cpu(((struct message_data *)skb->data)->counter); /* We ensure that the network header is part of the packet before we * call skb_cow_data, so that there's no chance that data is removed * from the skb, so that later we can extract the original endpoint. */ offset = skb->data - skb_network_header(skb); skb_push(skb, offset); num_frags = skb_cow_data(skb, 0, &trailer); offset += sizeof(struct message_data); skb_pull(skb, offset); if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg))) return false; sg_init_table(sg, num_frags); if (skb_to_sgvec(skb, sg, 0, skb->len) <= 0) return false; if (!chacha20poly1305_decrypt_sg_inplace(sg, skb->len, NULL, 0, PACKET_CB(skb)->nonce, keypair->receiving.key)) return false; /* Another ugly situation of pushing and pulling the header so as to * keep endpoint information intact. */ skb_push(skb, offset); if (pskb_trim(skb, skb->len - noise_encrypted_len(0))) return false; skb_pull(skb, offset); return true; } /* This is RFC6479, a replay detection bitmap algorithm that avoids bitshifts */ static bool counter_validate(struct noise_replay_counter *counter, u64 their_counter) { unsigned long index, index_current, top, i; bool ret = false; spin_lock_bh(&counter->lock); if (unlikely(counter->counter >= REJECT_AFTER_MESSAGES + 1 || their_counter >= REJECT_AFTER_MESSAGES)) goto out; ++their_counter; if (unlikely((COUNTER_WINDOW_SIZE + their_counter) < counter->counter)) goto out; index = their_counter >> ilog2(BITS_PER_LONG); if (likely(their_counter > counter->counter)) { index_current = counter->counter >> ilog2(BITS_PER_LONG); top = min_t(unsigned long, index - index_current, COUNTER_BITS_TOTAL / BITS_PER_LONG); for (i = 1; i <= top; ++i) counter->backtrack[(i + index_current) & ((COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1)] = 0; counter->counter = their_counter; } index &= (COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1; ret = !test_and_set_bit(their_counter & (BITS_PER_LONG - 1), &counter->backtrack[index]); out: spin_unlock_bh(&counter->lock); return ret; } #include "selftest/counter.c" static void wg_packet_consume_data_done(struct wg_peer *peer, struct sk_buff *skb, struct endpoint *endpoint) { struct net_device *dev = peer->device->dev; unsigned int len, len_before_trim; struct wg_peer *routed_peer; wg_socket_set_peer_endpoint(peer, endpoint); if (unlikely(wg_noise_received_with_keypair(&peer->keypairs, PACKET_CB(skb)->keypair))) { wg_timers_handshake_complete(peer); wg_packet_send_staged_packets(peer); } keep_key_fresh(peer); wg_timers_any_authenticated_packet_received(peer); wg_timers_any_authenticated_packet_traversal(peer); /* A packet with length 0 is a keepalive packet */ if (unlikely(!skb->len)) { update_rx_stats(peer, message_data_len(0)); net_dbg_ratelimited("%s: Receiving keepalive packet from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); goto packet_processed; } wg_timers_data_received(peer); if (unlikely(skb_network_header(skb) < skb->head)) goto dishonest_packet_size; if (unlikely(!(pskb_network_may_pull(skb, sizeof(struct iphdr)) && (ip_hdr(skb)->version == 4 || (ip_hdr(skb)->version == 6 && pskb_network_may_pull(skb, sizeof(struct ipv6hdr))))))) goto dishonest_packet_type; skb->dev = dev; /* We've already verified the Poly1305 auth tag, which means this packet * was not modified in transit. We can therefore tell the networking * stack that all checksums of every layer of encapsulation have already * been checked "by the hardware" and therefore is unnecessary to check * again in software. */ skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = ~0; /* All levels */ skb->protocol = ip_tunnel_parse_protocol(skb); if (skb->protocol == htons(ETH_P_IP)) { len = ntohs(ip_hdr(skb)->tot_len); if (unlikely(len < sizeof(struct iphdr))) goto dishonest_packet_size; INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ip_hdr(skb)->tos); } else if (skb->protocol == htons(ETH_P_IPV6)) { len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ipv6_get_dsfield(ipv6_hdr(skb))); } else { goto dishonest_packet_type; } if (unlikely(len > skb->len)) goto dishonest_packet_size; len_before_trim = skb->len; if (unlikely(pskb_trim(skb, len))) goto packet_processed; routed_peer = wg_allowedips_lookup_src(&peer->device->peer_allowedips, skb); wg_peer_put(routed_peer); /* We don't need the extra reference. */ if (unlikely(routed_peer != peer)) goto dishonest_packet_peer; napi_gro_receive(&peer->napi, skb); update_rx_stats(peer, message_data_len(len_before_trim)); return; dishonest_packet_peer: net_dbg_skb_ratelimited("%s: Packet has unallowed src IP (%pISc) from peer %llu (%pISpfsc)\n", dev->name, skb, peer->internal_id, &peer->endpoint.addr); DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_type: net_dbg_ratelimited("%s: Packet is neither ipv4 nor ipv6 from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_size: net_dbg_ratelimited("%s: Packet has incorrect size from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_length_errors); goto packet_processed; packet_processed: dev_kfree_skb(skb); } int wg_packet_rx_poll(struct napi_struct *napi, int budget) { struct wg_peer *peer = container_of(napi, struct wg_peer, napi); struct noise_keypair *keypair; struct endpoint endpoint; enum packet_state state; struct sk_buff *skb; int work_done = 0; bool free; if (unlikely(budget <= 0)) return 0; while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL && (state = atomic_read_acquire(&PACKET_CB(skb)->state)) != PACKET_STATE_UNCRYPTED) { wg_prev_queue_drop_peeked(&peer->rx_queue); keypair = PACKET_CB(skb)->keypair; free = true; if (unlikely(state != PACKET_STATE_CRYPTED)) goto next; if (unlikely(!counter_validate(&keypair->receiving_counter, PACKET_CB(skb)->nonce))) { net_dbg_ratelimited("%s: Packet has invalid nonce %llu (max %llu)\n", peer->device->dev->name, PACKET_CB(skb)->nonce, keypair->receiving_counter.counter); goto next; } if (unlikely(wg_socket_endpoint_from_skb(&endpoint, skb))) goto next; wg_reset_packet(skb, false); wg_packet_consume_data_done(peer, skb, &endpoint); free = false; next: wg_noise_keypair_put(keypair, false); wg_peer_put(peer); if (unlikely(free)) dev_kfree_skb(skb); if (++work_done >= budget) break; } if (work_done < budget) napi_complete_done(napi, work_done); return work_done; } void wg_packet_decrypt_worker(struct work_struct *work) { struct crypt_queue *queue = container_of(work, struct multicore_worker, work)->ptr; struct sk_buff *skb; while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) { enum packet_state state = likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ? PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; wg_queue_enqueue_per_peer_rx(skb, state); if (need_resched()) cond_resched(); } } static void wg_packet_consume_data(struct wg_device *wg, struct sk_buff *skb) { __le32 idx = ((struct message_data *)skb->data)->key_idx; struct wg_peer *peer = NULL; int ret; rcu_read_lock_bh(); PACKET_CB(skb)->keypair = (struct noise_keypair *)wg_index_hashtable_lookup( wg->index_hashtable, INDEX_HASHTABLE_KEYPAIR, idx, &peer); if (unlikely(!wg_noise_keypair_get(PACKET_CB(skb)->keypair))) goto err_keypair; if (unlikely(READ_ONCE(peer->is_dead))) goto err; ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb, wg->packet_crypt_wq); if (unlikely(ret == -EPIPE)) wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD); if (likely(!ret || ret == -EPIPE)) { rcu_read_unlock_bh(); return; } err: wg_noise_keypair_put(PACKET_CB(skb)->keypair, false); err_keypair: rcu_read_unlock_bh(); wg_peer_put(peer); dev_kfree_skb(skb); } void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb) { if (unlikely(prepare_skb_header(skb, wg) < 0)) goto err; switch (SKB_TYPE_LE32(skb)) { case cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION): case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE): case cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE): { int cpu, ret = -EBUSY; if (unlikely(!rng_is_initialized())) goto drop; if (atomic_read(&wg->handshake_queue_len) > MAX_QUEUED_INCOMING_HANDSHAKES / 2) { if (spin_trylock_bh(&wg->handshake_queue.ring.producer_lock)) { ret = __ptr_ring_produce(&wg->handshake_queue.ring, skb); spin_unlock_bh(&wg->handshake_queue.ring.producer_lock); } } else ret = ptr_ring_produce_bh(&wg->handshake_queue.ring, skb); if (ret) { drop: net_dbg_skb_ratelimited("%s: Dropping handshake packet from %pISpfsc\n", wg->dev->name, skb); goto err; } atomic_inc(&wg->handshake_queue_len); cpu = wg_cpumask_next_online(&wg->handshake_queue.last_cpu); /* Queues up a call to packet_process_queued_handshake_packets(skb): */ queue_work_on(cpu, wg->handshake_receive_wq, &per_cpu_ptr(wg->handshake_queue.worker, cpu)->work); break; } case cpu_to_le32(MESSAGE_DATA): PACKET_CB(skb)->ds = ip_tunnel_get_dsfield(ip_hdr(skb), skb); wg_packet_consume_data(wg, skb); break; default: WARN(1, "Non-exhaustive parsing of packet header lead to unknown packet type!\n"); goto err; } return; err: dev_kfree_skb(skb); }
5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 // SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter driver * * Copyright (C) 2009 - 2013 Johan Hovold (jhovold@gmail.com) * Copyright (C) 1999 - 2012 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2000 Peter Berger (pberger@brimson.com) * Copyright (C) 2000 Al Borchers (borchers@steinerpoint.com) * * This driver was originally based on the ACM driver by Armin Fuerst (which was * based on a driver by Brad Keryan) * * See Documentation/usb/usb-serial.rst for more information on using this * driver */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/kfifo.h> #include <linux/idr.h> #define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@linuxfoundation.org>" #define DRIVER_DESC "USB Serial Driver core" #define USB_SERIAL_TTY_MAJOR 188 #define USB_SERIAL_TTY_MINORS 512 /* should be enough for a while */ /* There is no MODULE_DEVICE_TABLE for usbserial.c. Instead the MODULE_DEVICE_TABLE declarations in each serial driver cause the "hotplug" program to pull in whatever module is necessary via modprobe, and modprobe will load usbserial because the serial drivers depend on it. */ static DEFINE_IDR(serial_minors); static DEFINE_MUTEX(table_lock); static LIST_HEAD(usb_serial_driver_list); /* * Look up the serial port structure. If it is found and it hasn't been * disconnected, return with the parent usb_serial structure's disc_mutex held * and its refcount incremented. Otherwise return NULL. */ struct usb_serial_port *usb_serial_port_get_by_minor(unsigned minor) { struct usb_serial *serial; struct usb_serial_port *port; mutex_lock(&table_lock); port = idr_find(&serial_minors, minor); if (!port) goto exit; serial = port->serial; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { mutex_unlock(&serial->disc_mutex); port = NULL; } else { kref_get(&serial->kref); } exit: mutex_unlock(&table_lock); return port; } static int allocate_minors(struct usb_serial *serial, int num_ports) { struct usb_serial_port *port; unsigned int i, j; int minor; dev_dbg(&serial->interface->dev, "%s %d\n", __func__, num_ports); mutex_lock(&table_lock); for (i = 0; i < num_ports; ++i) { port = serial->port[i]; minor = idr_alloc(&serial_minors, port, 0, USB_SERIAL_TTY_MINORS, GFP_KERNEL); if (minor < 0) goto error; port->minor = minor; port->port_number = i; } serial->minors_reserved = 1; mutex_unlock(&table_lock); return 0; error: /* unwind the already allocated minors */ for (j = 0; j < i; ++j) idr_remove(&serial_minors, serial->port[j]->minor); mutex_unlock(&table_lock); return minor; } static void release_minors(struct usb_serial *serial) { int i; mutex_lock(&table_lock); for (i = 0; i < serial->num_ports; ++i) idr_remove(&serial_minors, serial->port[i]->minor); mutex_unlock(&table_lock); serial->minors_reserved = 0; } int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; int ret; if (serial->sibling) return -EBUSY; ret = usb_driver_claim_interface(driver, intf, serial); if (ret) { dev_err(&serial->interface->dev, "failed to claim sibling interface: %d\n", ret); return ret; } serial->sibling = intf; return 0; } EXPORT_SYMBOL_GPL(usb_serial_claim_interface); static void release_sibling(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; struct usb_interface *sibling; if (!serial->sibling) return; if (intf == serial->sibling) sibling = serial->interface; else sibling = serial->sibling; usb_set_intfdata(sibling, NULL); usb_driver_release_interface(driver, sibling); } static void destroy_serial(struct kref *kref) { struct usb_serial *serial; struct usb_serial_port *port; int i; serial = to_usb_serial(kref); /* return the minor range that this device had */ if (serial->minors_reserved) release_minors(serial); if (serial->attached && serial->type->release) serial->type->release(serial); /* Now that nothing is using the ports, they can be freed */ for (i = 0; i < serial->num_port_pointers; ++i) { port = serial->port[i]; if (port) { port->serial = NULL; put_device(&port->dev); } } usb_put_intf(serial->interface); usb_put_dev(serial->dev); kfree(serial); } void usb_serial_put(struct usb_serial *serial) { kref_put(&serial->kref, destroy_serial); } /***************************************************************************** * Driver tty interface functions *****************************************************************************/ /** * serial_install - install tty * @driver: the driver (USB in our case) * @tty: the tty being created * * Initialise the termios structure for this tty. We use the default * USB serial settings but permit them to be overridden by * serial->type->init_termios on first open. * * This is the first place a new tty gets used. Hence this is where we * acquire references to the usb_serial structure and the driver module, * where we store a pointer to the port. All these actions are reversed * in serial_cleanup(). */ static int serial_install(struct tty_driver *driver, struct tty_struct *tty) { int idx = tty->index; struct usb_serial *serial; struct usb_serial_port *port; bool init_termios; int retval = -ENODEV; port = usb_serial_port_get_by_minor(idx); if (!port) return retval; serial = port->serial; if (!try_module_get(serial->type->driver.owner)) goto err_put_serial; init_termios = (driver->termios[idx] == NULL); retval = tty_standard_install(driver, tty); if (retval) goto err_put_module; mutex_unlock(&serial->disc_mutex); /* allow the driver to update the initial settings */ if (init_termios && serial->type->init_termios) serial->type->init_termios(tty); tty->driver_data = port; return retval; err_put_module: module_put(serial->type->driver.owner); err_put_serial: usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); return retval; } static int serial_port_activate(struct tty_port *tport, struct tty_struct *tty) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial *serial = port->serial; int retval; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { retval = -ENODEV; goto out_unlock; } retval = usb_autopm_get_interface(serial->interface); if (retval) goto out_unlock; retval = port->serial->type->open(tty, port); if (retval) usb_autopm_put_interface(serial->interface); out_unlock: mutex_unlock(&serial->disc_mutex); if (retval < 0) retval = usb_translate_errors(retval); return retval; } static int serial_open(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return tty_port_open(&port->port, tty, filp); } /** * serial_port_shutdown - shut down hardware * @tport: tty port to shut down * * Shut down a USB serial port. Serialized against activate by the * tport mutex and kept to matching open/close pairs * of calls by the tty-port initialized flag. * * Not called if tty is console. */ static void serial_port_shutdown(struct tty_port *tport) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial_driver *drv = port->serial->type; if (drv->close) drv->close(port); usb_autopm_put_interface(port->serial->interface); } static void serial_hangup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_hangup(&port->port); } static void serial_close(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_close(&port->port, tty, filp); } /** * serial_cleanup - free resources post close/hangup * @tty: tty to clean up * * Do the resource freeing and refcount dropping for the port. * Avoid freeing the console. * * Called asynchronously after the last tty kref is dropped. */ static void serial_cleanup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial; struct module *owner; dev_dbg(&port->dev, "%s\n", __func__); /* The console is magical. Do not hang up the console hardware * or there will be tears. */ if (port->port.console) return; tty->driver_data = NULL; serial = port->serial; owner = serial->type->driver.owner; usb_serial_put(serial); module_put(owner); } static ssize_t serial_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct usb_serial_port *port = tty->driver_data; int retval = -ENODEV; if (port->serial->dev->state == USB_STATE_NOTATTACHED) goto exit; dev_dbg(&port->dev, "%s - %zu byte(s)\n", __func__, count); retval = port->serial->type->write(tty, port, buf, count); if (retval < 0) retval = usb_translate_errors(retval); exit: return retval; } static unsigned int serial_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return port->serial->type->write_room(tty); } static unsigned int serial_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (serial->disconnected) return 0; return serial->type->chars_in_buffer(tty); } static void serial_wait_until_sent(struct tty_struct *tty, int timeout) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (!port->serial->type->wait_until_sent) return; mutex_lock(&serial->disc_mutex); if (!serial->disconnected) port->serial->type->wait_until_sent(tty, timeout); mutex_unlock(&serial->disc_mutex); } static void serial_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->throttle) port->serial->type->throttle(tty); } static void serial_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->unthrottle) port->serial->type->unthrottle(tty); } static int serial_get_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; mutex_lock(&tport->mutex); close_delay = jiffies_to_msecs(tport->close_delay) / 10; closing_wait = tport->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = jiffies_to_msecs(closing_wait) / 10; ss->line = port->minor; ss->close_delay = close_delay; ss->closing_wait = closing_wait; if (port->serial->type->get_serial) port->serial->type->get_serial(tty, ss); mutex_unlock(&tport->mutex); return 0; } static int serial_set_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; int ret = 0; close_delay = msecs_to_jiffies(ss->close_delay * 10); closing_wait = ss->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = msecs_to_jiffies(closing_wait * 10); mutex_lock(&tport->mutex); if (!capable(CAP_SYS_ADMIN)) { if (close_delay != tport->close_delay || closing_wait != tport->closing_wait) { ret = -EPERM; goto out_unlock; } } if (port->serial->type->set_serial) { ret = port->serial->type->set_serial(tty, ss); if (ret) goto out_unlock; } tport->close_delay = close_delay; tport->closing_wait = closing_wait; out_unlock: mutex_unlock(&tport->mutex); return ret; } static int serial_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct usb_serial_port *port = tty->driver_data; int retval = -ENOIOCTLCMD; dev_dbg(&port->dev, "%s - cmd 0x%04x\n", __func__, cmd); switch (cmd) { case TIOCMIWAIT: if (port->serial->type->tiocmiwait) retval = port->serial->type->tiocmiwait(tty, arg); break; default: if (port->serial->type->ioctl) retval = port->serial->type->ioctl(tty, cmd, arg); } return retval; } static void serial_set_termios(struct tty_struct *tty, const struct ktermios *old) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->set_termios) port->serial->type->set_termios(tty, port, old); else tty_termios_copy_hw(&tty->termios, old); } static int serial_break(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->break_ctl) return port->serial->type->break_ctl(tty, break_state); return -ENOTTY; } static int serial_proc_show(struct seq_file *m, void *v) { struct usb_serial *serial; struct usb_serial_port *port; int i; char tmp[40]; seq_puts(m, "usbserinfo:1.0 driver:2.0\n"); for (i = 0; i < USB_SERIAL_TTY_MINORS; ++i) { port = usb_serial_port_get_by_minor(i); if (port == NULL) continue; serial = port->serial; seq_printf(m, "%d:", i); if (serial->type->driver.owner) seq_printf(m, " module:%s", module_name(serial->type->driver.owner)); seq_printf(m, " name:\"%s\"", serial->type->description); seq_printf(m, " vendor:%04x product:%04x", le16_to_cpu(serial->dev->descriptor.idVendor), le16_to_cpu(serial->dev->descriptor.idProduct)); seq_printf(m, " num_ports:%d", serial->num_ports); seq_printf(m, " port:%d", port->port_number); usb_make_path(serial->dev, tmp, sizeof(tmp)); seq_printf(m, " path:%s", tmp); seq_putc(m, '\n'); usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); } return 0; } static int serial_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmget) return port->serial->type->tiocmget(tty); return -ENOTTY; } static int serial_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmset) return port->serial->type->tiocmset(tty, set, clear); return -ENOTTY; } static int serial_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->get_icount) return port->serial->type->get_icount(tty, icount); return -ENOTTY; } /* * We would be calling tty_wakeup here, but unfortunately some line * disciplines have an annoying habit of calling tty->write from * the write wakeup callback (e.g. n_hdlc.c). */ void usb_serial_port_softint(struct usb_serial_port *port) { schedule_work(&port->work); } EXPORT_SYMBOL_GPL(usb_serial_port_softint); static void usb_serial_port_work(struct work_struct *work) { struct usb_serial_port *port = container_of(work, struct usb_serial_port, work); tty_port_tty_wakeup(&port->port); } static void usb_serial_port_poison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_poison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_poison_urb(port->write_urbs[i]); usb_poison_urb(port->interrupt_in_urb); usb_poison_urb(port->interrupt_out_urb); } static void usb_serial_port_unpoison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_unpoison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_unpoison_urb(port->write_urbs[i]); usb_unpoison_urb(port->interrupt_in_urb); usb_unpoison_urb(port->interrupt_out_urb); } static void usb_serial_port_release(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); int i; dev_dbg(dev, "%s\n", __func__); usb_free_urb(port->interrupt_in_urb); usb_free_urb(port->interrupt_out_urb); for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { usb_free_urb(port->read_urbs[i]); kfree(port->bulk_in_buffers[i]); } for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { usb_free_urb(port->write_urbs[i]); kfree(port->bulk_out_buffers[i]); } kfifo_free(&port->write_fifo); kfree(port->interrupt_in_buffer); kfree(port->interrupt_out_buffer); tty_port_destroy(&port->port); kfree(port); } static struct usb_serial *create_serial(struct usb_device *dev, struct usb_interface *interface, struct usb_serial_driver *driver) { struct usb_serial *serial; serial = kzalloc(sizeof(*serial), GFP_KERNEL); if (!serial) return NULL; serial->dev = usb_get_dev(dev); serial->type = driver; serial->interface = usb_get_intf(interface); kref_init(&serial->kref); mutex_init(&serial->disc_mutex); serial->minors_reserved = 0; return serial; } static const struct usb_device_id *match_dynamic_id(struct usb_interface *intf, struct usb_serial_driver *drv) { struct usb_dynid *dynid; spin_lock(&drv->dynids.lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (usb_match_one_id(intf, &dynid->id)) { spin_unlock(&drv->dynids.lock); return &dynid->id; } } spin_unlock(&drv->dynids.lock); return NULL; } static const struct usb_device_id *get_iface_id(struct usb_serial_driver *drv, struct usb_interface *intf) { const struct usb_device_id *id; id = usb_match_id(intf, drv->id_table); if (id) { dev_dbg(&intf->dev, "static descriptor matches\n"); goto exit; } id = match_dynamic_id(intf, drv); if (id) dev_dbg(&intf->dev, "dynamic descriptor matches\n"); exit: return id; } /* Caller must hold table_lock */ static struct usb_serial_driver *search_serial_device( struct usb_interface *iface) { const struct usb_device_id *id = NULL; struct usb_serial_driver *drv; struct usb_driver *driver = to_usb_driver(iface->dev.driver); /* Check if the usb id matches a known device */ list_for_each_entry(drv, &usb_serial_driver_list, driver_list) { if (drv->usb_driver == driver) id = get_iface_id(drv, iface); if (id) return drv; } return NULL; } static bool serial_port_carrier_raised(struct tty_port *port) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->carrier_raised) return drv->carrier_raised(p); /* No carrier control - don't block */ return true; } static void serial_port_dtr_rts(struct tty_port *port, bool on) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->dtr_rts) drv->dtr_rts(p, on); } static ssize_t port_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_serial_port *port = to_usb_serial_port(dev); return sprintf(buf, "%u\n", port->port_number); } static DEVICE_ATTR_RO(port_number); static struct attribute *usb_serial_port_attrs[] = { &dev_attr_port_number.attr, NULL }; ATTRIBUTE_GROUPS(usb_serial_port); static const struct tty_port_operations serial_port_ops = { .carrier_raised = serial_port_carrier_raised, .dtr_rts = serial_port_dtr_rts, .activate = serial_port_activate, .shutdown = serial_port_shutdown, }; static void store_endpoint(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_endpoint_descriptor *epd) { struct device *dev = &serial->interface->dev; u8 addr = epd->bEndpointAddress; if (usb_endpoint_is_bulk_in(epd)) { if (epds->num_bulk_in == ARRAY_SIZE(epds->bulk_in)) return; dev_dbg(dev, "found bulk in endpoint %02x\n", addr); epds->bulk_in[epds->num_bulk_in++] = epd; } else if (usb_endpoint_is_bulk_out(epd)) { if (epds->num_bulk_out == ARRAY_SIZE(epds->bulk_out)) return; dev_dbg(dev, "found bulk out endpoint %02x\n", addr); epds->bulk_out[epds->num_bulk_out++] = epd; } else if (usb_endpoint_is_int_in(epd)) { if (epds->num_interrupt_in == ARRAY_SIZE(epds->interrupt_in)) return; dev_dbg(dev, "found interrupt in endpoint %02x\n", addr); epds->interrupt_in[epds->num_interrupt_in++] = epd; } else if (usb_endpoint_is_int_out(epd)) { if (epds->num_interrupt_out == ARRAY_SIZE(epds->interrupt_out)) return; dev_dbg(dev, "found interrupt out endpoint %02x\n", addr); epds->interrupt_out[epds->num_interrupt_out++] = epd; } } static void find_endpoints(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_interface *intf) { struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *epd; unsigned int i; iface_desc = intf->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { epd = &iface_desc->endpoint[i].desc; store_endpoint(serial, epds, epd); } } static int setup_port_bulk_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; buffer_size = max_t(int, type->bulk_in_size, usb_endpoint_maxp(epd)); port->bulk_in_size = buffer_size; port->bulk_in_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { set_bit(i, &port->read_urbs_free); port->read_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->read_urbs[i]) return -ENOMEM; port->bulk_in_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_in_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->read_urbs[i], udev, usb_rcvbulkpipe(udev, epd->bEndpointAddress), port->bulk_in_buffers[i], buffer_size, type->read_bulk_callback, port); } port->read_urb = port->read_urbs[0]; port->bulk_in_buffer = port->bulk_in_buffers[0]; return 0; } static int setup_port_bulk_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) return -ENOMEM; if (type->bulk_out_size) buffer_size = type->bulk_out_size; else buffer_size = usb_endpoint_maxp(epd); port->bulk_out_size = buffer_size; port->bulk_out_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { set_bit(i, &port->write_urbs_free); port->write_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->write_urbs[i]) return -ENOMEM; port->bulk_out_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_out_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->write_urbs[i], udev, usb_sndbulkpipe(udev, epd->bEndpointAddress), port->bulk_out_buffers[i], buffer_size, type->write_bulk_callback, port); } port->write_urb = port->write_urbs[0]; port->bulk_out_buffer = port->bulk_out_buffers[0]; return 0; } static int setup_port_interrupt_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_in_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_in_endpointAddress = epd->bEndpointAddress; port->interrupt_in_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_in_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_in_urb, udev, usb_rcvintpipe(udev, epd->bEndpointAddress), port->interrupt_in_buffer, buffer_size, type->read_int_callback, port, epd->bInterval); return 0; } static int setup_port_interrupt_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_out_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_out_size = buffer_size; port->interrupt_out_endpointAddress = epd->bEndpointAddress; port->interrupt_out_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_out_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_out_urb, udev, usb_sndintpipe(udev, epd->bEndpointAddress), port->interrupt_out_buffer, buffer_size, type->write_int_callback, port, epd->bInterval); return 0; } static int usb_serial_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct device *ddev = &interface->dev; struct usb_device *dev = interface_to_usbdev(interface); struct usb_serial *serial = NULL; struct usb_serial_port *port; struct usb_serial_endpoints *epds; struct usb_serial_driver *type = NULL; int retval; int i; int num_ports = 0; unsigned char max_endpoints; mutex_lock(&table_lock); type = search_serial_device(interface); if (!type) { mutex_unlock(&table_lock); dev_dbg(ddev, "none matched\n"); return -ENODEV; } if (!try_module_get(type->driver.owner)) { mutex_unlock(&table_lock); dev_err(ddev, "module get failed, exiting\n"); return -EIO; } mutex_unlock(&table_lock); serial = create_serial(dev, interface, type); if (!serial) { retval = -ENOMEM; goto err_put_module; } /* if this device type has a probe function, call it */ if (type->probe) { const struct usb_device_id *id; id = get_iface_id(type, interface); retval = type->probe(serial, id); if (retval) { dev_dbg(ddev, "sub driver rejected device\n"); goto err_release_sibling; } } /* descriptor matches, let's find the endpoints needed */ epds = kzalloc(sizeof(*epds), GFP_KERNEL); if (!epds) { retval = -ENOMEM; goto err_release_sibling; } find_endpoints(serial, epds, interface); if (serial->sibling) find_endpoints(serial, epds, serial->sibling); if (epds->num_bulk_in < type->num_bulk_in || epds->num_bulk_out < type->num_bulk_out || epds->num_interrupt_in < type->num_interrupt_in || epds->num_interrupt_out < type->num_interrupt_out) { dev_err(ddev, "required endpoints missing\n"); retval = -ENODEV; goto err_free_epds; } if (type->calc_num_ports) { retval = type->calc_num_ports(serial, epds); if (retval < 0) goto err_free_epds; num_ports = retval; } if (!num_ports) num_ports = type->num_ports; if (num_ports > MAX_NUM_PORTS) { dev_warn(ddev, "too many ports requested: %d\n", num_ports); num_ports = MAX_NUM_PORTS; } serial->num_ports = (unsigned char)num_ports; serial->num_bulk_in = epds->num_bulk_in; serial->num_bulk_out = epds->num_bulk_out; serial->num_interrupt_in = epds->num_interrupt_in; serial->num_interrupt_out = epds->num_interrupt_out; /* found all that we need */ dev_info(ddev, "%s converter detected\n", type->description); /* create our ports, we need as many as the max endpoints */ /* we don't use num_ports here because some devices have more endpoint pairs than ports */ max_endpoints = max(epds->num_bulk_in, epds->num_bulk_out); max_endpoints = max(max_endpoints, epds->num_interrupt_in); max_endpoints = max(max_endpoints, epds->num_interrupt_out); max_endpoints = max(max_endpoints, serial->num_ports); serial->num_port_pointers = max_endpoints; dev_dbg(ddev, "setting up %d port structure(s)\n", max_endpoints); for (i = 0; i < max_endpoints; ++i) { port = kzalloc(sizeof(struct usb_serial_port), GFP_KERNEL); if (!port) { retval = -ENOMEM; goto err_free_epds; } tty_port_init(&port->port); port->port.ops = &serial_port_ops; port->serial = serial; spin_lock_init(&port->lock); /* Keep this for private driver use for the moment but should probably go away */ INIT_WORK(&port->work, usb_serial_port_work); serial->port[i] = port; port->dev.parent = &interface->dev; port->dev.driver = NULL; port->dev.bus = &usb_serial_bus_type; port->dev.release = &usb_serial_port_release; port->dev.groups = usb_serial_port_groups; device_initialize(&port->dev); } /* set up the endpoint information */ for (i = 0; i < epds->num_bulk_in; ++i) { retval = setup_port_bulk_in(serial->port[i], epds->bulk_in[i]); if (retval) goto err_free_epds; } for (i = 0; i < epds->num_bulk_out; ++i) { retval = setup_port_bulk_out(serial->port[i], epds->bulk_out[i]); if (retval) goto err_free_epds; } if (serial->type->read_int_callback) { for (i = 0; i < epds->num_interrupt_in; ++i) { retval = setup_port_interrupt_in(serial->port[i], epds->interrupt_in[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_in) { dev_dbg(ddev, "The device claims to support interrupt in transfers, but read_int_callback is not defined\n"); } if (serial->type->write_int_callback) { for (i = 0; i < epds->num_interrupt_out; ++i) { retval = setup_port_interrupt_out(serial->port[i], epds->interrupt_out[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_out) { dev_dbg(ddev, "The device claims to support interrupt out transfers, but write_int_callback is not defined\n"); } usb_set_intfdata(interface, serial); /* if this device type has an attach function, call it */ if (type->attach) { retval = type->attach(serial); if (retval < 0) goto err_free_epds; serial->attached = 1; if (retval > 0) { /* quietly accept this device, but don't bind to a serial port as it's about to disappear */ serial->num_ports = 0; goto exit; } } else { serial->attached = 1; } retval = allocate_minors(serial, num_ports); if (retval) { dev_err(ddev, "No more free serial minor numbers\n"); goto err_free_epds; } /* register all of the individual ports with the driver core */ for (i = 0; i < num_ports; ++i) { port = serial->port[i]; dev_set_name(&port->dev, "ttyUSB%d", port->minor); dev_dbg(ddev, "registering %s\n", dev_name(&port->dev)); device_enable_async_suspend(&port->dev); retval = device_add(&port->dev); if (retval) dev_err(ddev, "Error registering port device, continuing\n"); } if (num_ports > 0) usb_serial_console_init(serial->port[0]->minor); exit: kfree(epds); module_put(type->driver.owner); return 0; err_free_epds: kfree(epds); err_release_sibling: release_sibling(serial, interface); usb_serial_put(serial); err_put_module: module_put(type->driver.owner); return retval; } static void usb_serial_disconnect(struct usb_interface *interface) { int i; struct usb_serial *serial = usb_get_intfdata(interface); struct device *dev = &interface->dev; struct usb_serial_port *port; struct tty_struct *tty; /* sibling interface is cleaning up */ if (!serial) return; usb_serial_console_disconnect(serial); mutex_lock(&serial->disc_mutex); /* must set a flag, to signal subdrivers */ serial->disconnected = 1; mutex_unlock(&serial->disc_mutex); for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; tty = tty_port_tty_get(&port->port); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } usb_serial_port_poison_urbs(port); wake_up_interruptible(&port->port.delta_msr_wait); cancel_work_sync(&port->work); if (device_is_registered(&port->dev)) device_del(&port->dev); } if (serial->type->disconnect) serial->type->disconnect(serial); release_sibling(serial, interface); /* let the last holder of this object cause it to be cleaned up */ usb_serial_put(serial); dev_info(dev, "device disconnected\n"); } int usb_serial_suspend(struct usb_interface *intf, pm_message_t message) { struct usb_serial *serial = usb_get_intfdata(intf); int i, r; /* suspend when called for first sibling interface */ if (serial->suspend_count++) return 0; /* * serial->type->suspend() MUST return 0 in system sleep context, * otherwise, the resume callback has to recover device from * previous suspend failure. */ if (serial->type->suspend) { r = serial->type->suspend(serial, message); if (r < 0) { serial->suspend_count--; return r; } } for (i = 0; i < serial->num_ports; ++i) usb_serial_port_poison_urbs(serial->port[i]); return 0; } EXPORT_SYMBOL(usb_serial_suspend); static void usb_serial_unpoison_port_urbs(struct usb_serial *serial) { int i; for (i = 0; i < serial->num_ports; ++i) usb_serial_port_unpoison_urbs(serial->port[i]); } int usb_serial_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->resume) rv = serial->type->resume(serial); else rv = usb_serial_generic_resume(serial); return rv; } EXPORT_SYMBOL(usb_serial_resume); static int usb_serial_reset_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->reset_resume) { rv = serial->type->reset_resume(serial); } else { rv = -EOPNOTSUPP; intf->needs_binding = 1; } return rv; } static const struct tty_operations serial_ops = { .open = serial_open, .close = serial_close, .write = serial_write, .hangup = serial_hangup, .write_room = serial_write_room, .ioctl = serial_ioctl, .set_termios = serial_set_termios, .throttle = serial_throttle, .unthrottle = serial_unthrottle, .break_ctl = serial_break, .chars_in_buffer = serial_chars_in_buffer, .wait_until_sent = serial_wait_until_sent, .tiocmget = serial_tiocmget, .tiocmset = serial_tiocmset, .get_icount = serial_get_icount, .set_serial = serial_set_serial, .get_serial = serial_get_serial, .cleanup = serial_cleanup, .install = serial_install, .proc_show = serial_proc_show, }; struct tty_driver *usb_serial_tty_driver; static int __init usb_serial_init(void) { int result; usb_serial_tty_driver = tty_alloc_driver(USB_SERIAL_TTY_MINORS, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(usb_serial_tty_driver)) return PTR_ERR(usb_serial_tty_driver); /* Initialize our global data */ result = bus_register(&usb_serial_bus_type); if (result) { pr_err("%s - registering bus driver failed\n", __func__); goto err_put_driver; } usb_serial_tty_driver->driver_name = "usbserial"; usb_serial_tty_driver->name = "ttyUSB"; usb_serial_tty_driver->major = USB_SERIAL_TTY_MAJOR; usb_serial_tty_driver->minor_start = 0; usb_serial_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; usb_serial_tty_driver->subtype = SERIAL_TYPE_NORMAL; usb_serial_tty_driver->init_termios = tty_std_termios; usb_serial_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; usb_serial_tty_driver->init_termios.c_ispeed = 9600; usb_serial_tty_driver->init_termios.c_ospeed = 9600; tty_set_operations(usb_serial_tty_driver, &serial_ops); result = tty_register_driver(usb_serial_tty_driver); if (result) { pr_err("%s - tty_register_driver failed\n", __func__); goto err_unregister_bus; } /* register the generic driver, if we should */ result = usb_serial_generic_register(); if (result < 0) { pr_err("%s - registering generic driver failed\n", __func__); goto err_unregister_driver; } return result; err_unregister_driver: tty_unregister_driver(usb_serial_tty_driver); err_unregister_bus: bus_unregister(&usb_serial_bus_type); err_put_driver: pr_err("%s - returning with error %d\n", __func__, result); tty_driver_kref_put(usb_serial_tty_driver); return result; } static void __exit usb_serial_exit(void) { usb_serial_console_exit(); usb_serial_generic_deregister(); tty_unregister_driver(usb_serial_tty_driver); tty_driver_kref_put(usb_serial_tty_driver); bus_unregister(&usb_serial_bus_type); idr_destroy(&serial_minors); } module_init(usb_serial_init); module_exit(usb_serial_exit); #define set_to_generic_if_null(type, function) \ do { \ if (!type->function) { \ type->function = usb_serial_generic_##function; \ pr_debug("%s: using generic " #function "\n", \ type->driver.name); \ } \ } while (0) static void usb_serial_operations_init(struct usb_serial_driver *device) { set_to_generic_if_null(device, open); set_to_generic_if_null(device, write); set_to_generic_if_null(device, close); set_to_generic_if_null(device, write_room); set_to_generic_if_null(device, chars_in_buffer); if (device->tx_empty) set_to_generic_if_null(device, wait_until_sent); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); set_to_generic_if_null(device, process_read_urb); set_to_generic_if_null(device, prepare_write_buffer); } static int usb_serial_register(struct usb_serial_driver *driver) { int retval; if (usb_disabled()) return -ENODEV; if (!driver->description) driver->description = driver->driver.name; if (!driver->usb_driver) { WARN(1, "Serial driver %s has no usb_driver\n", driver->description); return -EINVAL; } /* Prevent individual ports from being unbound. */ driver->driver.suppress_bind_attrs = true; usb_serial_operations_init(driver); /* Add this device to our list of devices */ mutex_lock(&table_lock); list_add(&driver->driver_list, &usb_serial_driver_list); retval = usb_serial_bus_register(driver); if (retval) { pr_err("problem %d when registering driver %s\n", retval, driver->description); list_del(&driver->driver_list); } else { pr_info("USB Serial support registered for %s\n", driver->description); } mutex_unlock(&table_lock); return retval; } static void usb_serial_deregister(struct usb_serial_driver *device) { pr_info("USB Serial deregistering driver %s\n", device->description); mutex_lock(&table_lock); list_del(&device->driver_list); mutex_unlock(&table_lock); usb_serial_bus_deregister(device); } /** * usb_serial_register_drivers - register drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be registered * @name: name of the usb_driver for this set of @serial_drivers * @id_table: list of all devices this @serial_drivers set binds to * * Registers all the drivers in the @serial_drivers array, and dynamically * creates a struct usb_driver with the name @name and id_table of @id_table. */ int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], const char *name, const struct usb_device_id *id_table) { int rc; struct usb_driver *udriver; struct usb_serial_driver * const *sd; /* * udriver must be registered before any of the serial drivers, * because the store_new_id() routine for the serial drivers (in * bus.c) probes udriver. * * Performance hack: We don't want udriver to be probed until * the serial drivers are registered, because the probe would * simply fail for lack of a matching serial driver. * So we leave udriver's id_table set to NULL until we are all set. * * Suspend/resume support is implemented in the usb-serial core, * so fill in the PM-related fields in udriver. */ udriver = kzalloc(sizeof(*udriver), GFP_KERNEL); if (!udriver) return -ENOMEM; udriver->name = name; udriver->no_dynamic_id = 1; udriver->supports_autosuspend = 1; udriver->suspend = usb_serial_suspend; udriver->resume = usb_serial_resume; udriver->probe = usb_serial_probe; udriver->disconnect = usb_serial_disconnect; /* we only set the reset_resume field if the serial_driver has one */ for (sd = serial_drivers; *sd; ++sd) { if ((*sd)->reset_resume) { udriver->reset_resume = usb_serial_reset_resume; break; } } rc = usb_register(udriver); if (rc) goto err_free_driver; for (sd = serial_drivers; *sd; ++sd) { (*sd)->usb_driver = udriver; rc = usb_serial_register(*sd); if (rc) goto err_deregister_drivers; } /* Now set udriver's id_table and look for matches */ udriver->id_table = id_table; rc = driver_attach(&udriver->driver); return 0; err_deregister_drivers: while (sd-- > serial_drivers) usb_serial_deregister(*sd); usb_deregister(udriver); err_free_driver: kfree(udriver); return rc; } EXPORT_SYMBOL_GPL(usb_serial_register_drivers); /** * usb_serial_deregister_drivers - deregister drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be deregistered * * Deregisters all the drivers in the @serial_drivers array and deregisters and * frees the struct usb_driver that was created by the call to * usb_serial_register_drivers(). */ void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]) { struct usb_driver *udriver = (*serial_drivers)->usb_driver; for (; *serial_drivers; ++serial_drivers) usb_serial_deregister(*serial_drivers); usb_deregister(udriver); kfree(udriver); } EXPORT_SYMBOL_GPL(usb_serial_deregister_drivers); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2");
3 1 1 2 4 2 1 1 3 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <linux/fsnotify.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "sync.h" struct io_sync { struct file *file; loff_t len; loff_t off; int flags; int mode; }; int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); sync->flags = READ_ONCE(sqe->sync_range_flags); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); int ret; /* sync_file_range always requires a blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); return IOU_OK; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; sync->flags = READ_ONCE(sqe->fsync_flags); if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fsync(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); loff_t end = sync->off + sync->len; int ret; /* fsync always requires a blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); return IOU_OK; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->addr); sync->mode = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); int ret; /* fallocate always requiring blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); return IOU_OK; }
23 20 20 3 23 23 23 1 22 20 19 19 19 2 19 19 3 3 19 19 19 3 19 18 1 19 19 19 1 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 /* * xxHash - Extremely Fast Hash algorithm * Copyright (C) 2012-2016, Yann Collet. * * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License version 2 as published by the * Free Software Foundation. This program is dual-licensed; you may select * either version 2 of the GNU General Public License ("GPL") or BSD license * ("BSD"). * * You can contact the author at: * - xxHash homepage: https://cyan4973.github.io/xxHash/ * - xxHash source repository: https://github.com/Cyan4973/xxHash */ #include <asm/unaligned.h> #include <linux/errno.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> #include <linux/xxhash.h> /*-************************************* * Macros **************************************/ #define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r))) #define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r))) #ifdef __LITTLE_ENDIAN # define XXH_CPU_LITTLE_ENDIAN 1 #else # define XXH_CPU_LITTLE_ENDIAN 0 #endif /*-************************************* * Constants **************************************/ static const uint32_t PRIME32_1 = 2654435761U; static const uint32_t PRIME32_2 = 2246822519U; static const uint32_t PRIME32_3 = 3266489917U; static const uint32_t PRIME32_4 = 668265263U; static const uint32_t PRIME32_5 = 374761393U; static const uint64_t PRIME64_1 = 11400714785074694791ULL; static const uint64_t PRIME64_2 = 14029467366897019727ULL; static const uint64_t PRIME64_3 = 1609587929392839161ULL; static const uint64_t PRIME64_4 = 9650029242287828579ULL; static const uint64_t PRIME64_5 = 2870177450012600261ULL; /*-************************** * Utils ***************************/ void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) { memcpy(dst, src, sizeof(*dst)); } EXPORT_SYMBOL(xxh32_copy_state); void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) { memcpy(dst, src, sizeof(*dst)); } EXPORT_SYMBOL(xxh64_copy_state); /*-*************************** * Simple Hash Functions ****************************/ static uint32_t xxh32_round(uint32_t seed, const uint32_t input) { seed += input * PRIME32_2; seed = xxh_rotl32(seed, 13); seed *= PRIME32_1; return seed; } uint32_t xxh32(const void *input, const size_t len, const uint32_t seed) { const uint8_t *p = (const uint8_t *)input; const uint8_t *b_end = p + len; uint32_t h32; if (len >= 16) { const uint8_t *const limit = b_end - 16; uint32_t v1 = seed + PRIME32_1 + PRIME32_2; uint32_t v2 = seed + PRIME32_2; uint32_t v3 = seed + 0; uint32_t v4 = seed - PRIME32_1; do { v1 = xxh32_round(v1, get_unaligned_le32(p)); p += 4; v2 = xxh32_round(v2, get_unaligned_le32(p)); p += 4; v3 = xxh32_round(v3, get_unaligned_le32(p)); p += 4; v4 = xxh32_round(v4, get_unaligned_le32(p)); p += 4; } while (p <= limit); h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) + xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18); } else { h32 = seed + PRIME32_5; } h32 += (uint32_t)len; while (p + 4 <= b_end) { h32 += get_unaligned_le32(p) * PRIME32_3; h32 = xxh_rotl32(h32, 17) * PRIME32_4; p += 4; } while (p < b_end) { h32 += (*p) * PRIME32_5; h32 = xxh_rotl32(h32, 11) * PRIME32_1; p++; } h32 ^= h32 >> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } EXPORT_SYMBOL(xxh32); static uint64_t xxh64_round(uint64_t acc, const uint64_t input) { acc += input * PRIME64_2; acc = xxh_rotl64(acc, 31); acc *= PRIME64_1; return acc; } static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val) { val = xxh64_round(0, val); acc ^= val; acc = acc * PRIME64_1 + PRIME64_4; return acc; } uint64_t xxh64(const void *input, const size_t len, const uint64_t seed) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; uint64_t h64; if (len >= 32) { const uint8_t *const limit = b_end - 32; uint64_t v1 = seed + PRIME64_1 + PRIME64_2; uint64_t v2 = seed + PRIME64_2; uint64_t v3 = seed + 0; uint64_t v4 = seed - PRIME64_1; do { v1 = xxh64_round(v1, get_unaligned_le64(p)); p += 8; v2 = xxh64_round(v2, get_unaligned_le64(p)); p += 8; v3 = xxh64_round(v3, get_unaligned_le64(p)); p += 8; v4 = xxh64_round(v4, get_unaligned_le64(p)); p += 8; } while (p <= limit); h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); h64 = xxh64_merge_round(h64, v1); h64 = xxh64_merge_round(h64, v2); h64 = xxh64_merge_round(h64, v3); h64 = xxh64_merge_round(h64, v4); } else { h64 = seed + PRIME64_5; } h64 += (uint64_t)len; while (p + 8 <= b_end) { const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); h64 ^= k1; h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; p += 8; } if (p + 4 <= b_end) { h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p += 4; } while (p < b_end) { h64 ^= (*p) * PRIME64_5; h64 = xxh_rotl64(h64, 11) * PRIME64_1; p++; } h64 ^= h64 >> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } EXPORT_SYMBOL(xxh64); /*-************************************************** * Advanced Hash Functions ***************************************************/ void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ struct xxh32_state state; memset(&state, 0, sizeof(state)); state.v1 = seed + PRIME32_1 + PRIME32_2; state.v2 = seed + PRIME32_2; state.v3 = seed + 0; state.v4 = seed - PRIME32_1; memcpy(statePtr, &state, sizeof(state)); } EXPORT_SYMBOL(xxh32_reset); void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ struct xxh64_state state; memset(&state, 0, sizeof(state)); state.v1 = seed + PRIME64_1 + PRIME64_2; state.v2 = seed + PRIME64_2; state.v3 = seed + 0; state.v4 = seed - PRIME64_1; memcpy(statePtr, &state, sizeof(state)); } EXPORT_SYMBOL(xxh64_reset); int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; if (input == NULL) return -EINVAL; state->total_len_32 += (uint32_t)len; state->large_len |= (len >= 16) | (state->total_len_32 >= 16); if (state->memsize + len < 16) { /* fill in tmp buffer */ memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); state->memsize += (uint32_t)len; return 0; } if (state->memsize) { /* some data left from previous update */ const uint32_t *p32 = state->mem32; memcpy((uint8_t *)(state->mem32) + state->memsize, input, 16 - state->memsize); state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); p32++; state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); p32++; state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); p32++; state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); p32++; p += 16-state->memsize; state->memsize = 0; } if (p <= b_end - 16) { const uint8_t *const limit = b_end - 16; uint32_t v1 = state->v1; uint32_t v2 = state->v2; uint32_t v3 = state->v3; uint32_t v4 = state->v4; do { v1 = xxh32_round(v1, get_unaligned_le32(p)); p += 4; v2 = xxh32_round(v2, get_unaligned_le32(p)); p += 4; v3 = xxh32_round(v3, get_unaligned_le32(p)); p += 4; v4 = xxh32_round(v4, get_unaligned_le32(p)); p += 4; } while (p <= limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < b_end) { memcpy(state->mem32, p, (size_t)(b_end-p)); state->memsize = (uint32_t)(b_end-p); } return 0; } EXPORT_SYMBOL(xxh32_update); uint32_t xxh32_digest(const struct xxh32_state *state) { const uint8_t *p = (const uint8_t *)state->mem32; const uint8_t *const b_end = (const uint8_t *)(state->mem32) + state->memsize; uint32_t h32; if (state->large_len) { h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); } else { h32 = state->v3 /* == seed */ + PRIME32_5; } h32 += state->total_len_32; while (p + 4 <= b_end) { h32 += get_unaligned_le32(p) * PRIME32_3; h32 = xxh_rotl32(h32, 17) * PRIME32_4; p += 4; } while (p < b_end) { h32 += (*p) * PRIME32_5; h32 = xxh_rotl32(h32, 11) * PRIME32_1; p++; } h32 ^= h32 >> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } EXPORT_SYMBOL(xxh32_digest); int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; if (input == NULL) return -EINVAL; state->total_len += len; if (state->memsize + len < 32) { /* fill in tmp buffer */ memcpy(((uint8_t *)state->mem64) + state->memsize, input, len); state->memsize += (uint32_t)len; return 0; } if (state->memsize) { /* tmp buffer is full */ uint64_t *p64 = state->mem64; memcpy(((uint8_t *)p64) + state->memsize, input, 32 - state->memsize); state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64)); p64++; state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64)); p64++; state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64)); p64++; state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64)); p += 32 - state->memsize; state->memsize = 0; } if (p + 32 <= b_end) { const uint8_t *const limit = b_end - 32; uint64_t v1 = state->v1; uint64_t v2 = state->v2; uint64_t v3 = state->v3; uint64_t v4 = state->v4; do { v1 = xxh64_round(v1, get_unaligned_le64(p)); p += 8; v2 = xxh64_round(v2, get_unaligned_le64(p)); p += 8; v3 = xxh64_round(v3, get_unaligned_le64(p)); p += 8; v4 = xxh64_round(v4, get_unaligned_le64(p)); p += 8; } while (p <= limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < b_end) { memcpy(state->mem64, p, (size_t)(b_end-p)); state->memsize = (uint32_t)(b_end - p); } return 0; } EXPORT_SYMBOL(xxh64_update); uint64_t xxh64_digest(const struct xxh64_state *state) { const uint8_t *p = (const uint8_t *)state->mem64; const uint8_t *const b_end = (const uint8_t *)state->mem64 + state->memsize; uint64_t h64; if (state->total_len >= 32) { const uint64_t v1 = state->v1; const uint64_t v2 = state->v2; const uint64_t v3 = state->v3; const uint64_t v4 = state->v4; h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); h64 = xxh64_merge_round(h64, v1); h64 = xxh64_merge_round(h64, v2); h64 = xxh64_merge_round(h64, v3); h64 = xxh64_merge_round(h64, v4); } else { h64 = state->v3 + PRIME64_5; } h64 += (uint64_t)state->total_len; while (p + 8 <= b_end) { const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); h64 ^= k1; h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; p += 8; } if (p + 4 <= b_end) { h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p += 4; } while (p < b_end) { h64 ^= (*p) * PRIME64_5; h64 = xxh_rotl64(h64, 11) * PRIME64_1; p++; } h64 ^= h64 >> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } EXPORT_SYMBOL(xxh64_digest); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("xxHash");
2441 2447 2206 2449 358 94 20 14 553 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #include <linux/siphash.h> #include <asm/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif
2 2 3 2 2 2 2 2 1 1 2 4 4 4 3 3 3 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 // SPDX-License-Identifier: GPL-2.0-only /* tunnel4.c: Generic IP tunnel transformer. * * Copyright (C) 2003 David S. Miller (davem@redhat.com) */ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/mpls.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : (family == AF_INET6) ? &tunnel64_handlers : &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel4_mutex); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } mutex_unlock(&tunnel4_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm_tunnel __rcu *head; struct xfrm_tunnel *handler; int ret; head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel4_input_afinfo = { .family = AF_INET, .is_ipip = true, .callback = tunnel4_rcv_cb, }; #endif #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct mpls_label))) goto drop; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, }; #if IS_ENABLED(CONFIG_IPV6) static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, }; #endif #if IS_ENABLED(CONFIG_MPLS) static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, }; #endif static int __init tunnel4_init(void) { if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) goto err; #if IS_ENABLED(CONFIG_IPV6) if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); goto err; } #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif goto err; } #endif #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif #if IS_ENABLED(CONFIG_MPLS) inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); #endif goto err; } #endif return 0; err: pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } static void __exit tunnel4_fini(void) { #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) pr_err("tunnel4 close: can't remove input afinfo\n"); #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); #endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: can't remove protocol\n"); #endif if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) pr_err("tunnel4 close: can't remove protocol\n"); } module_init(tunnel4_init); module_exit(tunnel4_fini); MODULE_LICENSE("GPL");
136 184 14 28 233 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CGROUP_INTERNAL_H #define __CGROUP_INTERNAL_H #include <linux/cgroup.h> #include <linux/kernfs.h> #include <linux/workqueue.h> #include <linux/list.h> #include <linux/refcount.h> #include <linux/fs_parser.h> #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; extern void __init enable_debug_cgroup(void); /* * cgroup_path() takes a spin lock. It is good practice not to take * spin locks within trace point handlers, as they are mostly hidden * from normal view. As cgroup_path() can take the kernfs_rename_lock * spin lock, it is best to not call that function from the trace event * handler. * * Note: trace_cgroup_##type##_enabled() is a static branch that will only * be set when the trace event is enabled. */ #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ unsigned long flags; \ spin_lock_irqsave(&trace_cgroup_path_lock, \ flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ spin_unlock_irqrestore(&trace_cgroup_path_lock, \ flags); \ } \ } while (0) /* * The cgroup filesystem superblock creation/mount context. */ struct cgroup_fs_context { struct kernfs_fs_context kfc; struct cgroup_root *root; struct cgroup_namespace *ns; unsigned int flags; /* CGRP_ROOT_* flags */ /* cgroup1 bits */ bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ u16 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; return container_of(kfc, struct cgroup_fs_context, kfc); } struct cgroup_pidlist; struct cgroup_file_ctx { struct cgroup_namespace *ns; struct { void *trigger; } psi; struct { bool started; struct css_task_iter iter; } procs; struct { struct cgroup_pidlist *pidlist; } procs1; }; /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other * direction, a css_set is naturally associated with multiple cgroups. * This M:N relationship is represented by the following link structure * which exists for each association and allows traversing the associations * from both sides. */ struct cgrp_cset_link { /* the cgroup and css_set this link associates */ struct cgroup *cgrp; struct css_set *cset; /* list of cgrp_cset_links anchored at cgrp->cset_links */ struct list_head cset_link; /* list of cgrp_cset_links anchored at css_set->cgrp_links */ struct list_head cgrp_link; }; /* used to track tasks and csets during migration */ struct cgroup_taskset { /* the src and dst cset list running through cset->mg_node */ struct list_head src_csets; struct list_head dst_csets; /* the number of tasks in the set */ int nr_tasks; /* the subsys currently being processed */ int ssid; /* * Fields for cgroup_taskset_*() iteration. * * Before migration is committed, the target migration tasks are on * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of * the csets on ->dst_csets. ->csets point to either ->src_csets * or ->dst_csets depending on whether migration is committed. * * ->cur_csets and ->cur_task point to the current task position * during iteration. */ struct list_head *csets; struct css_set *cur_cset; struct task_struct *cur_task; }; /* migration context also tracks preloading */ struct cgroup_mgctx { /* * Preloaded source and destination csets. Used to guarantee * atomic success or failure on actual migration. */ struct list_head preloaded_src_csets; struct list_head preloaded_dst_csets; /* tasks and csets to migrate */ struct cgroup_taskset tset; /* subsystems affected by migration */ u16 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ { \ .src_csets = LIST_HEAD_INIT(tset.src_csets), \ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ .csets = &tset.src_csets, \ } #define CGROUP_MGCTX_INIT(name) \ { \ LIST_HEAD_INIT(name.preloaded_src_csets), \ LIST_HEAD_INIT(name.preloaded_dst_csets), \ CGROUP_TASKSET_INIT(name.tset), \ } #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end */ #define for_each_subsys(ss, ssid) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) static inline bool cgroup_is_dead(const struct cgroup *cgrp) { return !(cgrp->self.flags & CSS_ONLINE); } static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } void put_css_set_locked(struct css_set *cset); static inline void put_css_set(struct css_set *cset) { unsigned long flags; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an * rwlock */ if (refcount_dec_not_one(&cset->refcount)) return; spin_lock_irqsave(&css_set_lock, flags); put_css_set_locked(cset); spin_unlock_irqrestore(&css_set_lock, flags); } /* * refcounted get/put for css_set objects */ static inline void get_css_set(struct css_set *cset) { refcount_inc(&cset->refcount); } bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root); struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); void cgroup_attach_lock(bool lock_threadgroup); void cgroup_attach_unlock(bool lock_threadgroup); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *locked) __acquires(&cgroup_threadgroup_rwsem); void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ int cgroup_rstat_init(struct cgroup *cgrp); void cgroup_rstat_exit(struct cgroup *cgrp); void cgroup_rstat_boot(void); void cgroup_base_stat_cputime_show(struct seq_file *seq); /* * namespace.c */ extern const struct proc_ns_operations cgroupns_operations; /* * cgroup-v1.c */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; extern const struct fs_parameter_spec cgroup1_fs_parameters[]; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); void cgroup1_check_for_release(struct cgroup *cgrp); int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); int cgroup1_get_tree(struct fs_context *fc); int cgroup1_reconfigure(struct fs_context *ctx); #endif /* __CGROUP_INTERNAL_H */
11635 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HARDIRQ_H #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> #include <asm/current.h> typedef struct { #if IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif unsigned int __nmi_count; /* arch dependent */ #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; unsigned int icr_read_retry_count; #endif #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; unsigned int apic_irq_work_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; #endif unsigned int irq_tlb_count; #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; #endif #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif #ifdef CONFIG_X86_MCE_AMD unsigned int irq_deferred_error_count; #endif #ifdef CONFIG_X86_HV_CALLBACK_VECTOR unsigned int irq_hv_callback_count; #endif #if IS_ENABLED(CONFIG_HYPERV) unsigned int irq_hv_reenlightenment_count; unsigned int hyperv_stimer0_count; #endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) extern void ack_bad_irq(unsigned int irq); extern u64 arch_irq_stat_cpu(unsigned int cpu); #define arch_irq_stat_cpu arch_irq_stat_cpu extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat #define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) static inline void kvm_set_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); } static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) { return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } #else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } #endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ #endif /* _ASM_X86_HARDIRQ_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 // SPDX-License-Identifier: GPL-2.0 /* * DMABUF System heap exporter * * Copyright (C) 2011 Google, Inc. * Copyright (C) 2019, 2020 Linaro Ltd. * * Portions based off of Andrew Davis' SRAM heap: * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/ * Andrew F. Davis <afd@ti.com> */ #include <linux/dma-buf.h> #include <linux/dma-mapping.h> #include <linux/dma-heap.h> #include <linux/err.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/vmalloc.h> static struct dma_heap *sys_heap; struct system_heap_buffer { struct dma_heap *heap; struct list_head attachments; struct mutex lock; unsigned long len; struct sg_table sg_table; int vmap_cnt; void *vaddr; }; struct dma_heap_attachment { struct device *dev; struct sg_table *table; struct list_head list; bool mapped; }; #define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO) #define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \ | __GFP_NORETRY) & ~__GFP_RECLAIM) \ | __GFP_COMP) static gfp_t order_flags[] = {HIGH_ORDER_GFP, HIGH_ORDER_GFP, LOW_ORDER_GFP}; /* * The selection of the orders used for allocation (1MB, 64K, 4K) is designed * to match with the sizes often found in IOMMUs. Using order 4 pages instead * of order 0 pages can significantly improve the performance of many IOMMUs * by reducing TLB pressure and time spent updating page tables. */ static const unsigned int orders[] = {8, 4, 0}; #define NUM_ORDERS ARRAY_SIZE(orders) static struct sg_table *dup_sg_table(struct sg_table *table) { struct sg_table *new_table; int ret, i; struct scatterlist *sg, *new_sg; new_table = kzalloc(sizeof(*new_table), GFP_KERNEL); if (!new_table) return ERR_PTR(-ENOMEM); ret = sg_alloc_table(new_table, table->orig_nents, GFP_KERNEL); if (ret) { kfree(new_table); return ERR_PTR(-ENOMEM); } new_sg = new_table->sgl; for_each_sgtable_sg(table, sg, i) { sg_set_page(new_sg, sg_page(sg), sg->length, sg->offset); new_sg = sg_next(new_sg); } return new_table; } static int system_heap_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { struct system_heap_buffer *buffer = dmabuf->priv; struct dma_heap_attachment *a; struct sg_table *table; a = kzalloc(sizeof(*a), GFP_KERNEL); if (!a) return -ENOMEM; table = dup_sg_table(&buffer->sg_table); if (IS_ERR(table)) { kfree(a); return -ENOMEM; } a->table = table; a->dev = attachment->dev; INIT_LIST_HEAD(&a->list); a->mapped = false; attachment->priv = a; mutex_lock(&buffer->lock); list_add(&a->list, &buffer->attachments); mutex_unlock(&buffer->lock); return 0; } static void system_heap_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { struct system_heap_buffer *buffer = dmabuf->priv; struct dma_heap_attachment *a = attachment->priv; mutex_lock(&buffer->lock); list_del(&a->list); mutex_unlock(&buffer->lock); sg_free_table(a->table); kfree(a->table); kfree(a); } static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attachment, enum dma_data_direction direction) { struct dma_heap_attachment *a = attachment->priv; struct sg_table *table = a->table; int ret; ret = dma_map_sgtable(attachment->dev, table, direction, 0); if (ret) return ERR_PTR(ret); a->mapped = true; return table; } static void system_heap_unmap_dma_buf(struct dma_buf_attachment *attachment, struct sg_table *table, enum dma_data_direction direction) { struct dma_heap_attachment *a = attachment->priv; a->mapped = false; dma_unmap_sgtable(attachment->dev, table, direction, 0); } static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { struct system_heap_buffer *buffer = dmabuf->priv; struct dma_heap_attachment *a; mutex_lock(&buffer->lock); if (buffer->vmap_cnt) invalidate_kernel_vmap_range(buffer->vaddr, buffer->len); list_for_each_entry(a, &buffer->attachments, list) { if (!a->mapped) continue; dma_sync_sgtable_for_cpu(a->dev, a->table, direction); } mutex_unlock(&buffer->lock); return 0; } static int system_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { struct system_heap_buffer *buffer = dmabuf->priv; struct dma_heap_attachment *a; mutex_lock(&buffer->lock); if (buffer->vmap_cnt) flush_kernel_vmap_range(buffer->vaddr, buffer->len); list_for_each_entry(a, &buffer->attachments, list) { if (!a->mapped) continue; dma_sync_sgtable_for_device(a->dev, a->table, direction); } mutex_unlock(&buffer->lock); return 0; } static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) { struct system_heap_buffer *buffer = dmabuf->priv; struct sg_table *table = &buffer->sg_table; unsigned long addr = vma->vm_start; struct sg_page_iter piter; int ret; for_each_sgtable_page(table, &piter, vma->vm_pgoff) { struct page *page = sg_page_iter_page(&piter); ret = remap_pfn_range(vma, addr, page_to_pfn(page), PAGE_SIZE, vma->vm_page_prot); if (ret) return ret; addr += PAGE_SIZE; if (addr >= vma->vm_end) return 0; } return 0; } static void *system_heap_do_vmap(struct system_heap_buffer *buffer) { struct sg_table *table = &buffer->sg_table; int npages = PAGE_ALIGN(buffer->len) / PAGE_SIZE; struct page **pages = vmalloc(sizeof(struct page *) * npages); struct page **tmp = pages; struct sg_page_iter piter; void *vaddr; if (!pages) return ERR_PTR(-ENOMEM); for_each_sgtable_page(table, &piter, 0) { WARN_ON(tmp - pages >= npages); *tmp++ = sg_page_iter_page(&piter); } vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); vfree(pages); if (!vaddr) return ERR_PTR(-ENOMEM); return vaddr; } static int system_heap_vmap(struct dma_buf *dmabuf, struct iosys_map *map) { struct system_heap_buffer *buffer = dmabuf->priv; void *vaddr; int ret = 0; mutex_lock(&buffer->lock); if (buffer->vmap_cnt) { buffer->vmap_cnt++; iosys_map_set_vaddr(map, buffer->vaddr); goto out; } vaddr = system_heap_do_vmap(buffer); if (IS_ERR(vaddr)) { ret = PTR_ERR(vaddr); goto out; } buffer->vaddr = vaddr; buffer->vmap_cnt++; iosys_map_set_vaddr(map, buffer->vaddr); out: mutex_unlock(&buffer->lock); return ret; } static void system_heap_vunmap(struct dma_buf *dmabuf, struct iosys_map *map) { struct system_heap_buffer *buffer = dmabuf->priv; mutex_lock(&buffer->lock); if (!--buffer->vmap_cnt) { vunmap(buffer->vaddr); buffer->vaddr = NULL; } mutex_unlock(&buffer->lock); iosys_map_clear(map); } static void system_heap_dma_buf_release(struct dma_buf *dmabuf) { struct system_heap_buffer *buffer = dmabuf->priv; struct sg_table *table; struct scatterlist *sg; int i; table = &buffer->sg_table; for_each_sgtable_sg(table, sg, i) { struct page *page = sg_page(sg); __free_pages(page, compound_order(page)); } sg_free_table(table); kfree(buffer); } static const struct dma_buf_ops system_heap_buf_ops = { .attach = system_heap_attach, .detach = system_heap_detach, .map_dma_buf = system_heap_map_dma_buf, .unmap_dma_buf = system_heap_unmap_dma_buf, .begin_cpu_access = system_heap_dma_buf_begin_cpu_access, .end_cpu_access = system_heap_dma_buf_end_cpu_access, .mmap = system_heap_mmap, .vmap = system_heap_vmap, .vunmap = system_heap_vunmap, .release = system_heap_dma_buf_release, }; static struct page *alloc_largest_available(unsigned long size, unsigned int max_order) { struct page *page; int i; for (i = 0; i < NUM_ORDERS; i++) { if (size < (PAGE_SIZE << orders[i])) continue; if (max_order < orders[i]) continue; page = alloc_pages(order_flags[i], orders[i]); if (!page) continue; return page; } return NULL; } static struct dma_buf *system_heap_allocate(struct dma_heap *heap, unsigned long len, unsigned long fd_flags, unsigned long heap_flags) { struct system_heap_buffer *buffer; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); unsigned long size_remaining = len; unsigned int max_order = orders[0]; struct dma_buf *dmabuf; struct sg_table *table; struct scatterlist *sg; struct list_head pages; struct page *page, *tmp_page; int i, ret = -ENOMEM; buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&buffer->attachments); mutex_init(&buffer->lock); buffer->heap = heap; buffer->len = len; INIT_LIST_HEAD(&pages); i = 0; while (size_remaining > 0) { /* * Avoid trying to allocate memory if the process * has been killed by SIGKILL */ if (fatal_signal_pending(current)) { ret = -EINTR; goto free_buffer; } page = alloc_largest_available(size_remaining, max_order); if (!page) goto free_buffer; list_add_tail(&page->lru, &pages); size_remaining -= page_size(page); max_order = compound_order(page); i++; } table = &buffer->sg_table; if (sg_alloc_table(table, i, GFP_KERNEL)) goto free_buffer; sg = table->sgl; list_for_each_entry_safe(page, tmp_page, &pages, lru) { sg_set_page(sg, page, page_size(page), 0); sg = sg_next(sg); list_del(&page->lru); } /* create the dmabuf */ exp_info.exp_name = dma_heap_get_name(heap); exp_info.ops = &system_heap_buf_ops; exp_info.size = buffer->len; exp_info.flags = fd_flags; exp_info.priv = buffer; dmabuf = dma_buf_export(&exp_info); if (IS_ERR(dmabuf)) { ret = PTR_ERR(dmabuf); goto free_pages; } return dmabuf; free_pages: for_each_sgtable_sg(table, sg, i) { struct page *p = sg_page(sg); __free_pages(p, compound_order(p)); } sg_free_table(table); free_buffer: list_for_each_entry_safe(page, tmp_page, &pages, lru) __free_pages(page, compound_order(page)); kfree(buffer); return ERR_PTR(ret); } static const struct dma_heap_ops system_heap_ops = { .allocate = system_heap_allocate, }; static int system_heap_create(void) { struct dma_heap_export_info exp_info; exp_info.name = "system"; exp_info.ops = &system_heap_ops; exp_info.priv = NULL; sys_heap = dma_heap_add(&exp_info); if (IS_ERR(sys_heap)) return PTR_ERR(sys_heap); return 0; } module_init(system_heap_create);
4 5 5 5 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/preempt.h> #include <linux/smp.h> #include <linux/completion.h> #include <asm/msr.h> static void __rdmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; int this_cpu = raw_smp_processor_id(); if (rv->msrs) reg = per_cpu_ptr(rv->msrs, this_cpu); else reg = &rv->reg; rdmsr(rv->msr_no, reg->l, reg->h); } static void __wrmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; int this_cpu = raw_smp_processor_id(); if (rv->msrs) reg = per_cpu_ptr(rv->msrs, this_cpu); else reg = &rv->reg; wrmsr(rv->msr_no, reg->l, reg->h); } int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *l = rv.reg.l; *h = rv.reg.h; return err; } EXPORT_SYMBOL(rdmsr_on_cpu); int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *q = rv.reg.q; return err; } EXPORT_SYMBOL(rdmsrl_on_cpu); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsr_on_cpu); int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsrl_on_cpu); static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs, void (*msr_func) (void *info)) { struct msr_info rv; int this_cpu; memset(&rv, 0, sizeof(rv)); rv.msrs = msrs; rv.msr_no = msr_no; this_cpu = get_cpu(); if (cpumask_test_cpu(this_cpu, mask)) msr_func(&rv); smp_call_function_many(mask, msr_func, &rv, 1); put_cpu(); } /* rdmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); } EXPORT_SYMBOL(rdmsr_on_cpus); /* * wrmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); } EXPORT_SYMBOL(wrmsr_on_cpus); struct msr_info_completion { struct msr_info msr; struct completion done; }; /* These "safe" variants are slower and should be used when the target MSR may not actually exist. */ static void __rdmsr_safe_on_cpu(void *info) { struct msr_info_completion *rv = info; rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h); complete(&rv->done); } static void __wrmsr_safe_on_cpu(void *info) { struct msr_info *rv = info; rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); } int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; call_single_data_t csd; int err; INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; err = smp_call_function_single_async(cpu, &csd); if (!err) { wait_for_completion(&rv.done); err = rv.msr.err; } *l = rv.msr.reg.l; *h = rv.msr.reg.h; return err; } EXPORT_SYMBOL(rdmsr_safe_on_cpu); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_on_cpu); int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsrl_safe_on_cpu); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { u32 low, high; int err; err = rdmsr_safe_on_cpu(cpu, msr_no, &low, &high); *q = (u64)high << 32 | low; return err; } EXPORT_SYMBOL(rdmsrl_safe_on_cpu); /* * These variants are significantly slower, but allows control over * the entire 32-bit GPR set. */ static void __rdmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = rdmsr_safe_regs(rv->regs); } static void __wrmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = wrmsr_safe_regs(rv->regs); } int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
1 62 62 11 62 10 3 2 9 2 62 62 62 56 8 62 60 61 61 62 62 3 10 4 61 61 62 2 1 54 54 3 3 3 5 4 3 3 3 5 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 // SPDX-License-Identifier: GPL-2.0-or-later /* Local endpoint object management * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/ip.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/af_rxrpc.h> #include "ar-internal.h" static void rxrpc_local_rcu(struct rcu_head *); /* * Handle an ICMP/ICMP6 error turning up at the tunnel. Push it through the * usual mechanism so that it gets parsed and presented through the UDP * socket's error_report(). */ static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { if (ip_hdr(skb)->version == IPVERSION) return ip_icmp_error(sk, skb, err, port, info, payload); if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6)) return ipv6_icmp_error(sk, skb, err, port, info, payload); } /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. * * We explicitly don't compare the RxRPC service ID as we want to reject * conflicting uses by differing services. Further, we don't want to share * addresses with different options (IPv6), so we don't compare those bits * either. */ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { long diff; diff = ((local->srx.transport_type - srx->transport_type) ?: (local->srx.transport_len - srx->transport_len) ?: (local->srx.transport.family - srx->transport.family)); if (diff != 0) return diff; switch (srx->transport.family) { case AF_INET: /* If the choice of UDP port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin.sin_port - (u16 __force)srx->transport.sin.sin_port) ?: memcmp(&local->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: /* If the choice of UDP6 port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&local->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); #endif default: BUG(); } } static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_local *local = container_of(timer, struct rxrpc_local, client_conn_reap_timer); if (!local->kill_all_client_conns && test_and_set_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) rxrpc_wake_up_io_thread(local); } /* * Allocate a new local endpoint. */ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; u32 tmp; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { refcount_set(&local->ref, 1); atomic_set(&local->active_users, 1); local->net = net; local->rxnet = rxrpc_net(net); INIT_HLIST_NODE(&local->link); init_completion(&local->io_thread_ready); #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY skb_queue_head_init(&local->rx_delay_queue); #endif skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->conn_attend_q); INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); local->kill_all_client_conns = false; INIT_LIST_HEAD(&local->idle_client_conns); timer_setup(&local->client_conn_reap_timer, rxrpc_client_conn_reap_timeout, 0); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; idr_init(&local->conn_ids); get_random_bytes(&tmp, sizeof(tmp)); tmp &= 0x3fffffff; if (tmp == 0) tmp = 1; idr_set_cursor(&local->conn_ids, tmp); INIT_LIST_HEAD(&local->new_client_calls); spin_lock_init(&local->client_call_lock); trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1); } _leave(" = %p", local); return local; } /* * create the local socket * - must be called with rxrpc_local_mutex locked */ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct sockaddr_rxrpc *srx = &local->srx; struct udp_port_cfg udp_conf = {0}; struct task_struct *io_thread; struct sock *usk; int ret; _enter("%p{%d,%d}", local, srx->transport_type, srx->transport.family); udp_conf.family = srx->transport.family; udp_conf.use_udp_checksums = true; if (udp_conf.family == AF_INET) { udp_conf.local_ip = srx->transport.sin.sin_addr; udp_conf.local_udp_port = srx->transport.sin.sin_port; #if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) } else { udp_conf.local_ip6 = srx->transport.sin6.sin6_addr; udp_conf.local_udp_port = srx->transport.sin6.sin6_port; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; #endif } ret = udp_sock_create(net, &udp_conf, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; } tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_encap_rcv; tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); /* set the socket up */ usk = local->socket->sk; usk->sk_error_report = rxrpc_error_report; switch (srx->transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ ip6_sock_set_recverr(usk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. */ fallthrough; case AF_INET: /* we want to receive ICMP errors */ ip_sock_set_recverr(usk); /* we want to set the don't fragment bit */ ip_sock_set_mtu_discover(usk, IP_PMTUDISC_DO); /* We want receive timestamps. */ sock_enable_timestamps(usk); break; default: BUG(); } io_thread = kthread_run(rxrpc_io_thread, local, "krxrpcio/%u", ntohs(udp_conf.local_udp_port)); if (IS_ERR(io_thread)) { ret = PTR_ERR(io_thread); goto error_sock; } wait_for_completion(&local->io_thread_ready); local->io_thread = io_thread; _leave(" = 0"); return 0; error_sock: kernel_sock_shutdown(local->socket, SHUT_RDWR); local->socket->sk->sk_user_data = NULL; sock_release(local->socket); local->socket = NULL; return ret; } /* * Look up or create a new local endpoint using the specified local address. */ struct rxrpc_local *rxrpc_lookup_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; struct rxrpc_net *rxnet = rxrpc_net(net); struct hlist_node *cursor; long diff; int ret; _enter("{%d,%d,%pISp}", srx->transport_type, srx->transport.family, &srx->transport); mutex_lock(&rxnet->local_mutex); hlist_for_each(cursor, &rxnet->local_endpoints) { local = hlist_entry(cursor, struct rxrpc_local, link); diff = rxrpc_local_cmp_key(local, srx); if (diff != 0) continue; /* Services aren't allowed to share transport sockets, so * reject that here. It is possible that the object is dying - * but it may also still have the local transport address that * we want bound. */ if (srx->srx_service) { local = NULL; goto addr_in_use; } /* Found a match. We want to replace a dying object. * Attempting to bind the transport socket may still fail if * we're attempting to use a local address that the dying * object is still using. */ if (!rxrpc_use_local(local, rxrpc_local_use_lookup)) break; goto found; } local = rxrpc_alloc_local(net, srx); if (!local) goto nomem; ret = rxrpc_open_socket(local, net); if (ret < 0) goto sock_error; if (cursor) { hlist_replace_rcu(cursor, &local->link); cursor->pprev = NULL; } else { hlist_add_head_rcu(&local->link, &rxnet->local_endpoints); } found: mutex_unlock(&rxnet->local_mutex); _leave(" = %p", local); return local; nomem: ret = -ENOMEM; sock_error: mutex_unlock(&rxnet->local_mutex); if (local) call_rcu(&local->rcu, rxrpc_local_rcu); _leave(" = %d", ret); return ERR_PTR(ret); addr_in_use: mutex_unlock(&rxnet->local_mutex); _leave(" = -EADDRINUSE"); return ERR_PTR(-EADDRINUSE); } /* * Get a ref on a local endpoint. */ struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; u = atomic_read(&local->active_users); __refcount_inc(&local->ref, &r); trace_rxrpc_local(local->debug_id, why, r + 1, u); return local; } /* * Get a ref on a local endpoint unless its usage has already reached 0. */ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; if (local && __refcount_inc_not_zero(&local->ref, &r)) { u = atomic_read(&local->active_users); trace_rxrpc_local(local->debug_id, why, r + 1, u); return local; } return NULL; } /* * Drop a ref on a local endpoint. */ void rxrpc_put_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { unsigned int debug_id; bool dead; int r, u; if (local) { debug_id = local->debug_id; u = atomic_read(&local->active_users); dead = __refcount_dec_and_test(&local->ref, &r); trace_rxrpc_local(debug_id, why, r, u); if (dead) call_rcu(&local->rcu, rxrpc_local_rcu); } } /* * Start using a local endpoint. */ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { local = rxrpc_get_local_maybe(local, rxrpc_local_get_for_use); if (!local) return NULL; if (!__rxrpc_use_local(local, why)) { rxrpc_put_local(local, rxrpc_local_put_for_use); return NULL; } return local; } /* * Cease using a local endpoint. Once the number of active users reaches 0, we * start the closure of the transport in the I/O thread.. */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { unsigned int debug_id; int r, u; if (local) { debug_id = local->debug_id; r = refcount_read(&local->ref); u = atomic_dec_return(&local->active_users); trace_rxrpc_local(debug_id, why, r, u); if (u == 0) kthread_stop(local->io_thread); } } /* * Destroy a local endpoint's socket and then hand the record to RCU to dispose * of. * * Closing the socket cannot be done from bottom half context or RCU callback * context because it might sleep. */ void rxrpc_destroy_local(struct rxrpc_local *local) { struct socket *socket = local->socket; struct rxrpc_net *rxnet = local->rxnet; _enter("%d", local->debug_id); local->dead = true; mutex_lock(&rxnet->local_mutex); hlist_del_init_rcu(&local->link); mutex_unlock(&rxnet->local_mutex); rxrpc_clean_up_local_conns(local); rxrpc_service_connection_reaper(&rxnet->service_conn_reaper); ASSERT(!local->service); if (socket) { local->socket = NULL; kernel_sock_shutdown(socket, SHUT_RDWR); socket->sk->sk_user_data = NULL; sock_release(socket); } /* At this point, there should be no more packets coming in to the * local endpoint. */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY rxrpc_purge_queue(&local->rx_delay_queue); #endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); } /* * Destroy a local endpoint after the RCU grace period expires. */ static void rxrpc_local_rcu(struct rcu_head *rcu) { struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu); rxrpc_see_local(local, rxrpc_local_free); kfree(local); } /* * Verify the local endpoint list is empty by this point. */ void rxrpc_destroy_all_locals(struct rxrpc_net *rxnet) { struct rxrpc_local *local; _enter(""); flush_workqueue(rxrpc_workqueue); if (!hlist_empty(&rxnet->local_endpoints)) { mutex_lock(&rxnet->local_mutex); hlist_for_each_entry(local, &rxnet->local_endpoints, link) { pr_err("AF_RXRPC: Leaked local %p {%d}\n", local, refcount_read(&local->ref)); } mutex_unlock(&rxnet->local_mutex); BUG(); } }
6299 14671 15552 18701 2417 4332 3641 1268 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 /* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper functions for accessing the file_struct fd array. */ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/posix_types.h> #include <linux/errno.h> #include <linux/cleanup.h> struct file; extern void fput(struct file *); struct file_operations; struct task_struct; struct vfsmount; struct dentry; struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); static inline void fput_light(struct file *file, int fput_needed) { if (fput_needed) fput(file); } struct fd { struct file *file; unsigned int flags; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 static inline void fdput(struct fd fd) { if (fd.flags & FDPUT_FPUT) fput(fd.file); } extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern unsigned long __fdget(unsigned int fd); extern unsigned long __fdget_raw(unsigned int fd); extern unsigned long __fdget_pos(unsigned int fd); extern void __f_unlock_pos(struct file *); static inline struct fd __to_fd(unsigned long v) { return (struct fd){(struct file *)(v & ~3),v & 3}; } static inline struct fd fdget(unsigned int fd) { return __to_fd(__fdget(fd)); } static inline struct fd fdget_raw(unsigned int fd) { return __to_fd(__fdget_raw(fd)); } static inline struct fd fdget_pos(int fd) { return __to_fd(__fdget_pos(fd)); } static inline void fdput_pos(struct fd f) { if (f.flags & FDPUT_POS_UNLOCK) __f_unlock_pos(f.file); fdput(f); } DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #endif /* __LINUX_FILE_H */
272 1 2 3 4 5 6 7 8 // SPDX-License-Identifier: GPL-2.0 #include <linux/static_call.h> long __static_call_return0(void) { return 0; } EXPORT_SYMBOL_GPL(__static_call_return0);
7 3 2 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_rateest.h> #include <net/netfilter/xt_rateest.h> static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; struct gnet_stats_rate_est64 sample = {0}; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; gen_estimator_read(&info->est1->rate_est, &sample); if (info->flags & XT_RATEEST_MATCH_DELTA) { bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0; pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0; } else { bps1 = sample.bps; pps1 = sample.pps; } if (info->flags & XT_RATEEST_MATCH_ABS) { bps2 = info->bps2; pps2 = info->pps2; } else { gen_estimator_read(&info->est2->rate_est, &sample); if (info->flags & XT_RATEEST_MATCH_DELTA) { bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0; pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0; } else { bps2 = sample.bps; pps2 = sample.pps; } } switch (info->mode) { case XT_RATEEST_MATCH_LT: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 < bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 < pps2; break; case XT_RATEEST_MATCH_GT: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 > bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 > pps2; break; case XT_RATEEST_MATCH_EQ: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 == bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 == pps2; break; } ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false; return ret; } static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) { struct xt_rateest_match_info *info = par->matchinfo; struct xt_rateest *est1, *est2; int ret = -EINVAL; if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | XT_RATEEST_MATCH_REL)) != 1) goto err1; if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS))) goto err1; switch (info->mode) { case XT_RATEEST_MATCH_EQ: case XT_RATEEST_MATCH_LT: case XT_RATEEST_MATCH_GT: break; default: goto err1; } ret = -ENOENT; est1 = xt_rateest_lookup(par->net, info->name1); if (!est1) goto err1; est2 = NULL; if (info->flags & XT_RATEEST_MATCH_REL) { est2 = xt_rateest_lookup(par->net, info->name2); if (!est2) goto err2; } info->est1 = est1; info->est2 = est2; return 0; err2: xt_rateest_put(par->net, est1); err1: return ret; } static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) { struct xt_rateest_match_info *info = par->matchinfo; xt_rateest_put(par->net, info->est1); if (info->est2) xt_rateest_put(par->net, info->est2); } static struct xt_match xt_rateest_mt_reg __read_mostly = { .name = "rateest", .revision = 0, .family = NFPROTO_UNSPEC, .match = xt_rateest_mt, .checkentry = xt_rateest_mt_checkentry, .destroy = xt_rateest_mt_destroy, .matchsize = sizeof(struct xt_rateest_match_info), .usersize = offsetof(struct xt_rateest_match_info, est1), .me = THIS_MODULE, }; static int __init xt_rateest_mt_init(void) { return xt_register_match(&xt_rateest_mt_reg); } static void __exit xt_rateest_mt_fini(void) { xt_unregister_match(&xt_rateest_mt_reg); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("xtables rate estimator match"); MODULE_ALIAS("ipt_rateest"); MODULE_ALIAS("ip6t_rateest"); module_init(xt_rateest_mt_init); module_exit(xt_rateest_mt_fini);
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 // SPDX-License-Identifier: GPL-2.0-or-later /* * MPLS GSO Support * * Authors: Simon Horman (horms@verge.net.au) * * Based on: GSO portions of net/ipv4/gre.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/err.h> #include <linux/module.h> #include <linux/netdev_features.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/gso.h> #include <net/mpls.h> static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; netdev_features_t mpls_features; u16 mac_len = skb->mac_len; __be16 mpls_protocol; unsigned int mpls_hlen; skb_reset_network_header(skb); mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN)) goto out; if (unlikely(!pskb_may_pull(skb, mpls_hlen))) goto out; /* Setup inner SKB. */ mpls_protocol = skb->protocol; skb->protocol = skb->inner_protocol; __skb_pull(skb, mpls_hlen); skb->mac_len = 0; skb_reset_mac_header(skb); /* Segment inner packet. */ mpls_features = skb->dev->mpls_features & features; segs = skb_mac_gso_segment(skb, mpls_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset, mac_len); goto out; } skb = segs; mpls_hlen += mac_len; do { skb->mac_len = mac_len; skb->protocol = mpls_protocol; skb_reset_inner_network_header(skb); __skb_push(skb, mpls_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); } while ((skb = skb->next)); out: return segs; } static struct packet_offload mpls_mc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_MC), .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, }; static struct packet_offload mpls_uc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, }; static int __init mpls_gso_init(void) { pr_info("MPLS GSO support\n"); dev_add_offload(&mpls_uc_offload); dev_add_offload(&mpls_mc_offload); return 0; } static void __exit mpls_gso_exit(void) { dev_remove_offload(&mpls_uc_offload); dev_remove_offload(&mpls_mc_offload); } module_init(mpls_gso_init); module_exit(mpls_gso_exit); MODULE_DESCRIPTION("MPLS GSO support"); MODULE_AUTHOR("Simon Horman (horms@verge.net.au)"); MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 1 30 2 3 6 8 10 11 30 23 30 150 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/cmdline.c * Helper functions generally used for parsing kernel command line * and module options. * * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c. * * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/ctype.h> /* * If a hyphen was found in get_option, this will handle the * range of numbers, M-N. This will expand the range and insert * the values[M, M+1, ..., N] into the ints array in get_options. */ static int get_range(char **str, int *pint, int n) { int x, inc_counter, upper_range; (*str)++; upper_range = simple_strtol((*str), NULL, 0); inc_counter = upper_range - *pint; for (x = *pint; n && x < upper_range; x++, n--) *pint++ = x; return inc_counter; } /** * get_option - Parse integer from an option string * @str: option string * @pint: (optional output) integer value parsed from @str * * Read an int from an option string; if available accept a subsequent * comma as well. * * When @pint is NULL the function can be used as a validator of * the current option in the string. * * Return values: * 0 - no int in string * 1 - int found, no subsequent comma * 2 - int found including a subsequent comma * 3 - hyphen found to denote a range * * Leading hyphen without integer is no integer case, but we consume it * for the sake of simplification. */ int get_option(char **str, int *pint) { char *cur = *str; int value; if (!cur || !(*cur)) return 0; if (*cur == '-') value = -simple_strtoull(++cur, str, 0); else value = simple_strtoull(cur, str, 0); if (pint) *pint = value; if (cur == *str) return 0; if (**str == ',') { (*str)++; return 2; } if (**str == '-') return 3; return 1; } EXPORT_SYMBOL(get_option); /** * get_options - Parse a string into a list of integers * @str: String to be parsed * @nints: size of integer array * @ints: integer array (must have room for at least one element) * * This function parses a string containing a comma-separated * list of integers, a hyphen-separated range of _positive_ integers, * or a combination of both. The parse halts when the array is * full, or when no more numbers can be retrieved from the * string. * * When @nints is 0, the function just validates the given @str and * returns the amount of parseable integers as described below. * * Returns: * * The first element is filled by the number of collected integers * in the range. The rest is what was parsed from the @str. * * Return value is the character in the string which caused * the parse to end (typically a null terminator, if @str is * completely parseable). */ char *get_options(const char *str, int nints, int *ints) { bool validate = (nints == 0); int res, i = 1; while (i < nints || validate) { int *pint = validate ? ints : ints + i; res = get_option((char **)&str, pint); if (res == 0) break; if (res == 3) { int n = validate ? 0 : nints - i; int range_nums; range_nums = get_range((char **)&str, pint, n); if (range_nums < 0) break; /* * Decrement the result by one to leave out the * last number in the range. The next iteration * will handle the upper number in the range */ i += (range_nums - 1); } i++; if (res == 1) break; } ints[0] = i - 1; return (char *)str; } EXPORT_SYMBOL(get_options); /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { case 'E': case 'e': ret <<= 10; fallthrough; case 'P': case 'p': ret <<= 10; fallthrough; case 'T': case 't': ret <<= 10; fallthrough; case 'G': case 'g': ret <<= 10; fallthrough; case 'M': case 'm': ret <<= 10; fallthrough; case 'K': case 'k': ret <<= 10; endptr++; fallthrough; default: break; } if (retptr) *retptr = endptr; return ret; } EXPORT_SYMBOL(memparse); /** * parse_option_str - Parse a string and check an option is set or not * @str: String to be parsed * @option: option name * * This function parses a string containing a comma-separated list of * strings like a=b,c. * * Return true if there's such option in the string, or return false. */ bool parse_option_str(const char *str, const char *option) { while (*str) { if (!strncmp(str, option, strlen(option))) { str += strlen(option); if (!*str || *str == ',') return true; } while (*str && *str != ',') str++; if (*str == ',') str++; } return false; } /* * Parse a string to get a param value pair. * You can use " around spaces, but can't escape ". * Hyphens and underscores equivalent in parameter names. */ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; if (*args == '"') { args++; in_quote = 1; quoted = 1; } for (i = 0; args[i]; i++) { if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; args += i + 1; } else args += i; /* Chew up trailing spaces. */ return skip_spaces(args); } EXPORT_SYMBOL(next_arg);
3 18 17 16 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/prandom.h * * Include file for the fast pseudo-random 32-bit * generation. */ #ifndef _LINUX_PRANDOM_H #define _LINUX_PRANDOM_H #include <linux/types.h> #include <linux/once.h> #include <linux/random.h> struct rnd_state { __u32 s1, s2, s3, s4; }; u32 prandom_u32_state(struct rnd_state *state); void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); #define prandom_init_once(pcpu_state) \ DO_ONCE(prandom_seed_full_state, (pcpu_state)) /* * Handle minimum values for seeds */ static inline u32 __seed(u32 x, u32 m) { return (x < m) ? x + m : x; } /** * prandom_seed_state - set seed for prandom_u32_state(). * @state: pointer to state structure to receive the seed. * @seed: arbitrary 64-bit value to use as a seed. */ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) { u32 i = ((seed >> 32) ^ (seed << 10) ^ seed) & 0xffffffffUL; state->s1 = __seed(i, 2U); state->s2 = __seed(i, 8U); state->s3 = __seed(i, 16U); state->s4 = __seed(i, 128U); } /* Pseudo random number generator from numerical recipes. */ static inline u32 next_pseudo_random32(u32 seed) { return seed * 1664525 + 1013904223; } #endif
15 187 187 187 166 51 187 35 22 77 76 165 166 166 166 166 51 202 202 182 202 201 182 181 158 181 181 23 1 24 24 24 24 92 100 101 24 24 13 13 13 159 158 159 23 39 39 38 39 39 39 169 169 187 187 115 183 187 187 1 187 187 166 156 159 159 159 159 159 158 15 157 156 157 182 182 157 15 15 15 95 39 92 39 92 94 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 // SPDX-License-Identifier: GPL-2.0-or-later /* * zswap.c - zswap driver file * * zswap is a cache that takes pages that are in the process * of being swapped out and attempts to compress and store them in a * RAM-based memory pool. This can result in a significant I/O reduction on * the swap device and, in the case where decompressing from RAM is faster * than reading from the swap device, can also improve workload performance. * * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/mempolicy.h> #include <linux/mempool.h> #include <linux/zpool.h> #include <crypto/acompress.h> #include <linux/zswap.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/swapops.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/workqueue.h> #include <linux/list_lru.h> #include "swap.h" #include "internal.h" /********************************* * statistics **********************************/ /* Total bytes used by the compressed storage */ u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for * performance reasons so they may not be a 100% accurate. However, * they do provide useful information on roughly how many times a * certain event is occurring. */ /* Pool limit was hit (see zswap_max_pool_percent) */ static u64 zswap_pool_limit_hit; /* Pages written back when pool limit was reached */ static u64 zswap_written_back_pages; /* Store failed due to a reclaim failure after pool limit was reached */ static u64 zswap_reject_reclaim_fail; /* Store failed due to compression algorithm failure */ static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; /* Store failed because underlying allocator could not get memory */ static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; /* Duplicate store was encountered (rare) */ static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; /* Pool limit was hit, we need to calm down */ static bool zswap_pool_reached_full; /********************************* * tunables **********************************/ #define ZSWAP_PARAM_UNSET "" static int zswap_setup(void); /* Enable/disable zswap */ static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_enabled_param_ops = { .set = zswap_enabled_param_set, .get = param_get_bool, }; module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, .get = param_get_charp, .free = param_free_charp, }; module_param_cb(compressor, &zswap_compressor_param_ops, &zswap_compressor, 0644); /* Compressed storage zpool to use */ static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static const struct kernel_param_ops zswap_zpool_param_ops = { .set = zswap_zpool_param_set, .get = param_get_charp, .free = param_free_charp, }; module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); /* The threshold for accepting new pages after the max_pool_percent was hit */ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); /* * Enable/disable handling same-value filled pages (enabled by default). * If disabled every page is considered non-same-value filled. */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, bool, 0644); /* Enable/disable handling non-same-value filled pages (enabled by default) */ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); static bool zswap_exclusive_loads_enabled = IS_ENABLED( CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 /* Enable/disable memory pressure-based shrinker. */ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); bool is_zswap_enabled(void) { return zswap_enabled; } /********************************* * data structures **********************************/ struct crypto_acomp_ctx { struct crypto_acomp *acomp; struct acomp_req *req; struct crypto_wait wait; u8 *buffer; struct mutex mutex; }; /* * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock. * The only case where lru_lock is not acquired while holding tree.lock is * when a zswap_entry is taken off the lru for writeback, in that case it * needs to be verified that it's still valid in the tree. */ struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; struct work_struct release_work; struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; struct shrinker *shrinker; atomic_t nr_stored; }; /* * struct zswap_entry * * This structure contains the metadata for tracking a single compressed * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; int refcount; unsigned int length; struct zswap_pool *pool; union { unsigned long handle; unsigned long value; }; struct obj_cgroup *objcg; struct list_head lru; }; /* * The tree lock in the zswap_tree struct protects a few things: * - the rbtree * - the refcount field of each entry in the tree */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); /* pool counter to provide unique names to zpool */ static atomic_t zswap_pools_count = ATOMIC_INIT(0); enum zswap_init_type { ZSWAP_UNINIT, ZSWAP_INIT_SUCCEED, ZSWAP_INIT_FAILED }; static enum zswap_init_type zswap_init_state; /* used to ensure the integrity of initialization */ static DEFINE_MUTEX(zswap_init_lock); /* init completed, but couldn't create the initial pool */ static bool zswap_has_pool; /********************************* * helpers and fwd declarations **********************************/ #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static bool zswap_can_accept(void) { return totalram_pages() * zswap_accept_thr_percent / 100 * zswap_max_pool_percent / 100 > DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static u64 get_zswap_pool_size(struct zswap_pool *pool) { u64 pool_size = 0; int i; for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) pool_size += zpool_get_total_size(pool->zpools[i]); return pool_size; } static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) total += get_zswap_pool_size(pool); rcu_read_unlock(); zswap_pool_total_size = total; } /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) { return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; } #else static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) { return NULL; } #endif static inline int entry_to_nid(struct zswap_entry *entry) { return page_to_nid(virt_to_page(entry)); } void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { struct zswap_pool *pool; /* lock out zswap pools list modification */ spin_lock(&zswap_pools_lock); list_for_each_entry(pool, &zswap_pools, list) { if (pool->next_shrink == memcg) pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); } spin_unlock(&zswap_pools_lock); } /********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { struct zswap_entry *entry; entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; entry->refcount = 1; RB_CLEAR_NODE(&entry->rbnode); return entry; } static void zswap_entry_cache_free(struct zswap_entry *entry) { kmem_cache_free(zswap_entry_cache, entry); } /********************************* * zswap lruvec functions **********************************/ void zswap_lruvec_state_init(struct lruvec *lruvec) { atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); } void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; if (folio) { lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } } /********************************* * lru functions **********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { atomic_long_t *nr_zswap_protected; unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection * used in list_lru lookup, only two scenarios are possible: * * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The * new entry will be reparented to memcg's parent's list_lru. * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del() and list_lru_putback(). */ rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); /* Update the protection area */ lru_size = list_lru_count_one(list_lru, nid, memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; old = atomic_long_inc_return(nr_zswap_protected); /* * Decay to avoid overflow and adapt to changing workloads. * This is based on LRU reclaim cost decaying heuristics. */ do { new = old > lru_size / 4 ? old / 2 : old; } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) { int nid = entry_to_nid(entry); struct mem_cgroup *memcg; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_del(list_lru, &entry->lru, nid, memcg); rcu_read_unlock(); } static void zswap_lru_putback(struct list_lru *list_lru, struct zswap_entry *entry) { int nid = entry_to_nid(entry); spinlock_t *lock = &list_lru->node[nid].lock; struct mem_cgroup *memcg; struct lruvec *lruvec; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); spin_lock(lock); /* we cannot use list_lru_add here, because it increments node's lru count */ list_lru_putback(list_lru, &entry->lru, nid, memcg); spin_unlock(lock); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); /* increment the protection area to account for the LRU rotation. */ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); rcu_read_unlock(); } /********************************* * rbtree functions **********************************/ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) { struct rb_node *node = root->rb_node; struct zswap_entry *entry; pgoff_t entry_offset; while (node) { entry = rb_entry(node, struct zswap_entry, rbnode); entry_offset = swp_offset(entry->swpentry); if (entry_offset > offset) node = node->rb_left; else if (entry_offset < offset) node = node->rb_right; else return entry; } return NULL; } /* * In the case that a entry with the same offset is found, a pointer to * the existing entry is stored in dupentry and the function returns -EEXIST */ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, struct zswap_entry **dupentry) { struct rb_node **link = &root->rb_node, *parent = NULL; struct zswap_entry *myentry; pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); while (*link) { parent = *link; myentry = rb_entry(parent, struct zswap_entry, rbnode); myentry_offset = swp_offset(myentry->swpentry); if (myentry_offset > entry_offset) link = &(*link)->rb_left; else if (myentry_offset < entry_offset) link = &(*link)->rb_right; else { *dupentry = myentry; return -EEXIST; } } rb_link_node(&entry->rbnode, parent, link); rb_insert_color(&entry->rbnode, root); return 0; } static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) { if (!RB_EMPTY_NODE(&entry->rbnode)) { rb_erase(&entry->rbnode, root); RB_CLEAR_NODE(&entry->rbnode); return true; } return false; } static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { int i = 0; if (ZSWAP_NR_ZPOOLS > 1) i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); return entry->pool->zpools[i]; } /* * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_entry *entry) { if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); } if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); } /* caller must hold the tree lock */ static void zswap_entry_get(struct zswap_entry *entry) { entry->refcount++; } /* caller must hold the tree lock * remove from the tree and free it, if nobody reference the entry */ static void zswap_entry_put(struct zswap_tree *tree, struct zswap_entry *entry) { int refcount = --entry->refcount; WARN_ON_ONCE(refcount < 0); if (refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_free_entry(entry); } } /* caller must hold the tree lock */ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) zswap_entry_get(entry); return entry; } /********************************* * shrinker functions **********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg); static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); unsigned long shrink_ret, nr_protected, lru_size; struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { sc->nr_scanned = 0; return SHRINK_STOP; } nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); lru_size = list_lru_shrink_count(&pool->list_lru, sc); /* * Abort if we are shrinking into the protected region. * * This short-circuiting is necessary because if we have too many multiple * concurrent reclaimers getting the freeable zswap object counts at the * same time (before any of them made reasonable progress), the total * number of reclaimed objects might be more than the number of unprotected * objects (i.e the reclaimers will reclaim into the protected area of the * zswap LRU). */ if (nr_protected >= lru_size - sc->nr_to_scan) { sc->nr_scanned = 0; return SHRINK_STOP; } shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) return SHRINK_STOP; return shrink_ret ? shrink_ret : SHRINK_STOP; } static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { struct zswap_pool *pool = shrinker->private_data; struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; #ifdef CONFIG_MEMCG_KMEM mem_cgroup_flush_stats(memcg); nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else /* use pool stats instead of memcg stats */ nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; nr_stored = atomic_read(&pool->nr_stored); #endif if (!nr_stored) return 0; nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. */ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; /* * Scale the number of freeable pages by the memory saving factor. * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). */ return mult_frac(nr_freeable, nr_backing, nr_stored); } static void zswap_alloc_shrinker(struct zswap_pool *pool) { pool->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); if (!pool->shrinker) return; pool->shrinker->private_data = pool; pool->shrinker->scan_objects = zswap_shrinker_scan; pool->shrinker->count_objects = zswap_shrinker_count; pool->shrinker->batch = 0; pool->shrinker->seeks = DEFAULT_SEEKS; } /********************************* * per-cpu code **********************************/ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); struct crypto_acomp *acomp; struct acomp_req *req; int ret; mutex_init(&acomp_ctx->mutex); acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!acomp_ctx->buffer) return -ENOMEM; acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); goto acomp_fail; } acomp_ctx->acomp = acomp; req = acomp_request_alloc(acomp_ctx->acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; goto req_fail; } acomp_ctx->req = req; crypto_init_wait(&acomp_ctx->wait); /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback * won't be called, crypto_wait_req() will return without blocking. */ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); return 0; req_fail: crypto_free_acomp(acomp_ctx->acomp); acomp_fail: kfree(acomp_ctx->buffer); return ret; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); if (!IS_ERR_OR_NULL(acomp_ctx)) { if (!IS_ERR_OR_NULL(acomp_ctx->req)) acomp_request_free(acomp_ctx->req); if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); kfree(acomp_ctx->buffer); } return 0; } /********************************* * pool functions **********************************/ static struct zswap_pool *__zswap_pool_current(void) { struct zswap_pool *pool; pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); WARN_ONCE(!pool && zswap_has_pool, "%s: no page storage pool!\n", __func__); return pool; } static struct zswap_pool *zswap_pool_current(void) { assert_spin_locked(&zswap_pools_lock); return __zswap_pool_current(); } static struct zswap_pool *zswap_pool_current_get(void) { struct zswap_pool *pool; rcu_read_lock(); pool = __zswap_pool_current(); if (!zswap_pool_get(pool)) pool = NULL; rcu_read_unlock(); return pool; } static struct zswap_pool *zswap_pool_last_get(void) { struct zswap_pool *pool, *last = NULL; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) last = pool; WARN_ONCE(!last && zswap_has_pool, "%s: no page storage pool!\n", __func__); if (!zswap_pool_get(last)) last = NULL; rcu_read_unlock(); return last; } /* type and compressor must be null-terminated */ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { struct zswap_pool *pool; assert_spin_locked(&zswap_pools_lock); list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; /* all zpools share the same type */ if (strcmp(zpool_get_type(pool->zpools[0]), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_get(pool)) continue; return pool; } return NULL; } /* * If the entry is still valid in the tree, drop the initial ref and remove it * from the tree. This function must be called with an additional ref held, * otherwise it may race with another invalidation freeing the entry. */ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { if (zswap_rb_erase(&tree->rbroot, entry)) zswap_entry_put(tree, entry); } static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; struct zswap_tree *tree; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; /* * Once the lru lock is dropped, the entry might get freed. The * swpoffset is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree. */ swpoffset = swp_offset(entry->swpentry); tree = zswap_trees[swp_type(entry->swpentry)]; list_lru_isolate(l, item); /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); /* Check for invalidate() race */ spin_lock(&tree->lock); if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) goto unlock; /* Hold a reference to prevent a free during writeback */ zswap_entry_get(entry); spin_unlock(&tree->lock); writeback_result = zswap_writeback_entry(entry, tree); spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* * Encountering a page already in swap cache is a sign that we are shrinking * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ if (writeback_result == -EEXIST && encountered_page_in_swapcache) { ret = LRU_SKIP; *encountered_page_in_swapcache = true; } goto put_unlock; } zswap_written_back_pages++; if (entry->objcg) count_objcg_event(entry->objcg, ZSWPWB); count_vm_event(ZSWPWB); /* * Writeback started successfully, the page now belongs to the * swapcache. Drop the entry from zswap - unless invalidate already * took it out while we had the tree->lock released for IO. */ zswap_invalidate_entry(tree, entry); put_unlock: /* Drop local reference */ zswap_entry_put(tree, entry); unlock: spin_unlock(&tree->lock); spin_lock(lock); return ret; } static int shrink_memcg(struct mem_cgroup *memcg) { struct zswap_pool *pool; int nid, shrunk = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) return -EINVAL; /* * Skip zombies because their LRUs are reparented and we would be * reclaiming from the parent instead of the dead memcg. */ if (memcg && !mem_cgroup_online(memcg)) return -ENOENT; pool = zswap_pool_current_get(); if (!pool) return -EINVAL; for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } zswap_pool_put(pool); return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); struct mem_cgroup *memcg; int ret, failures = 0; /* global reclaim will select cgroup in a round-robin fashion. */ do { spin_lock(&zswap_pools_lock); pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); memcg = pool->next_shrink; /* * We need to retry if we have gone through a full round trip, or if we * got an offline memcg (or else we risk undoing the effect of the * zswap memcg offlining cleanup callback). This is not catastrophic * per se, but it will keep the now offlined memcg hostage for a while. * * Note that if we got an online memcg, we will keep the extra * reference in case the original reference obtained by mem_cgroup_iter * is dropped by the zswap memcg offlining callback, ensuring that the * memcg is not killed when we are reclaiming. */ if (!memcg) { spin_unlock(&zswap_pools_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); pool->next_shrink = NULL; spin_unlock(&zswap_pools_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } spin_unlock(&zswap_pools_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); if (ret == -EINVAL) break; if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: cond_resched(); } while (!zswap_can_accept()); zswap_pool_put(pool); } static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { int i; struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; int ret; if (!zswap_has_pool) { /* if either are unset, pool initialization failed, and we * need both params to be set correctly before trying to * create a pool. */ if (!strcmp(type, ZSWAP_PARAM_UNSET)) return NULL; if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) return NULL; } pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); pool->zpools[i] = zpool_create_pool(type, name, gfp); if (!pool->zpools[i]) { pr_err("%s zpool not available\n", type); goto error; } } pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); if (ret) goto error; zswap_alloc_shrinker(pool); if (!pool->shrinker) goto error; pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) goto lru_fail; shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; lru_fail: list_lru_destroy(&pool->list_lru); shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); while (i--) zpool_destroy_pool(pool->zpools[i]); kfree(pool); return NULL; } static struct zswap_pool *__zswap_pool_create_fallback(void) { bool has_comp, has_zpool; has_comp = crypto_has_acomp(zswap_compressor, 0, 0); if (!has_comp && strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; has_comp = crypto_has_acomp(zswap_compressor, 0, 0); } if (!has_comp) { pr_err("default compressor %s not available\n", zswap_compressor); param_free_charp(&zswap_compressor); zswap_compressor = ZSWAP_PARAM_UNSET; } has_zpool = zpool_has_pool(zswap_zpool_type); if (!has_zpool && strcmp(zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT)) { pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); param_free_charp(&zswap_zpool_type); zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; has_zpool = zpool_has_pool(zswap_zpool_type); } if (!has_zpool) { pr_err("default zpool %s not available\n", zswap_zpool_type); param_free_charp(&zswap_zpool_type); zswap_zpool_type = ZSWAP_PARAM_UNSET; } if (!has_comp || !has_zpool) return NULL; return zswap_pool_create(zswap_zpool_type, zswap_compressor); } static void zswap_pool_destroy(struct zswap_pool *pool) { int i; zswap_pool_debug("destroying", pool); shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); spin_lock(&zswap_pools_lock); mem_cgroup_iter_break(NULL, pool->next_shrink); pool->next_shrink = NULL; spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); } static int __must_check zswap_pool_get(struct zswap_pool *pool) { if (!pool) return 0; return kref_get_unless_zero(&pool->kref); } static void __zswap_pool_release(struct work_struct *work) { struct zswap_pool *pool = container_of(work, typeof(*pool), release_work); synchronize_rcu(); /* nobody should have been able to get a kref... */ WARN_ON(kref_get_unless_zero(&pool->kref)); /* pool is now off zswap_pools list and has no references. */ zswap_pool_destroy(pool); } static void __zswap_pool_empty(struct kref *kref) { struct zswap_pool *pool; pool = container_of(kref, typeof(*pool), kref); spin_lock(&zswap_pools_lock); WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); INIT_WORK(&pool->release_work, __zswap_pool_release); schedule_work(&pool->release_work); spin_unlock(&zswap_pools_lock); } static void zswap_pool_put(struct zswap_pool *pool) { kref_put(&pool->kref, __zswap_pool_empty); } /********************************* * param callbacks **********************************/ static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) { /* no change required */ if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) return false; return true; } /* val must be a null-terminated string */ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *type, char *compressor) { struct zswap_pool *pool, *put_pool = NULL; char *s = strstrip((char *)val); int ret = 0; bool new_pool = false; mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init. */ ret = param_set_charp(s, kp); break; case ZSWAP_INIT_SUCCEED: new_pool = zswap_pool_changed(s, kp); break; case ZSWAP_INIT_FAILED: pr_err("can't set param, initialization failed\n"); ret = -ENODEV; } mutex_unlock(&zswap_init_lock); /* no need to create a new pool, return directly */ if (!new_pool) return ret; if (!type) { if (!zpool_has_pool(s)) { pr_err("zpool %s not available\n", s); return -ENOENT; } type = s; } else if (!compressor) { if (!crypto_has_acomp(s, 0, 0)) { pr_err("compressor %s not available\n", s); return -ENOENT; } compressor = s; } else { WARN_ON(1); return -EINVAL; } spin_lock(&zswap_pools_lock); pool = zswap_pool_find_get(type, compressor); if (pool) { zswap_pool_debug("using existing", pool); WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); } spin_unlock(&zswap_pools_lock); if (!pool) pool = zswap_pool_create(type, compressor); if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; spin_lock(&zswap_pools_lock); if (!ret) { put_pool = zswap_pool_current(); list_add_rcu(&pool->list, &zswap_pools); zswap_has_pool = true; } else if (pool) { /* add the possibly pre-existing pool to the end of the pools * list; if it's new (and empty) then it'll be removed and * destroyed by the put after we drop the lock */ list_add_tail_rcu(&pool->list, &zswap_pools); put_pool = pool; } spin_unlock(&zswap_pools_lock); if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also * failed, maybe both compressor and zpool params were bad. * Allow changing this param, so pool creation will succeed * when the other param is changed. We already verified this * param is ok in the zpool_has_pool() or crypto_has_acomp() * checks above. */ ret = param_set_charp(s, kp); } /* drop the ref from either the old current pool, * or the new pool we failed to add */ if (put_pool) zswap_pool_put(put_pool); return ret; } static int zswap_compressor_param_set(const char *val, const struct kernel_param *kp) { return __zswap_param_set(val, kp, zswap_zpool_type, NULL); } static int zswap_zpool_param_set(const char *val, const struct kernel_param *kp) { return __zswap_param_set(val, kp, NULL, zswap_compressor); } static int zswap_enabled_param_set(const char *val, const struct kernel_param *kp) { int ret = -ENODEV; /* if this is load-time (pre-init) param setting, only set param. */ if (system_state != SYSTEM_RUNNING) return param_set_bool(val, kp); mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: if (zswap_setup()) break; fallthrough; case ZSWAP_INIT_SUCCEED: if (!zswap_has_pool) pr_err("can't enable, no pool configured\n"); else ret = param_set_bool(val, kp); break; case ZSWAP_INIT_FAILED: pr_err("can't enable, initialization failed\n"); } mutex_unlock(&zswap_init_lock); return ret; } static void __zswap_load(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; u8 *src; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(zpool)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); } sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); } /********************************* * writeback code **********************************/ /* * Attempts to free an entry by adding a folio to the swap cache, * decompressing the entry data into the folio, and issuing a * bio write to write the folio back to the swap device. * * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree) { swp_entry_t swpentry = entry->swpentry; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; /* try to allocate swap cache folio */ mpol = get_task_policy(current); folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, NO_INTERLEAVE_INDEX, &folio_was_allocated, true); if (!folio) return -ENOMEM; /* * Found an existing folio, we raced with load/swapin. We generally * writeback cold folios from zswap, and swapin means the folio just * became hot. Skip this folio and let the caller find another one. */ if (!folio_was_allocated) { folio_put(folio); return -EEXIST; } /* * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot. Verify that the * swap entry hasn't been invalidated and recycled behind our * backs (our zswap_entry reference doesn't prevent that), to * avoid overwriting a new swap folio with old compressed data. */ spin_lock(&tree->lock); if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); return -ENOMEM; } spin_unlock(&tree->lock); __zswap_load(entry, &folio->page); /* folio is up to date */ folio_mark_uptodate(folio); /* move it to the tail of the inactive list after end_writeback */ folio_set_reclaim(folio); /* start writeback */ __swap_writepage(folio, &wbc); folio_put(folio); return 0; } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; unsigned long val; unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; page = (unsigned long *)ptr; val = page[0]; if (val != page[last_pos]) return 0; for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) return 0; } *value = val; return 1; } static void zswap_fill_page(void *ptr, unsigned long value) { unsigned long *page; page = (unsigned long *)ptr; memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; struct zpool *zpool; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; gfp_t gfp; int ret; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); /* Large folios aren't supported */ if (folio_test_large(folio)) return false; if (!zswap_enabled || !tree) return false; /* * If this is a duplicate, it must be removed before attempting to store * it, otherwise, if the store fails the old page won't be removed from * the tree, and it might be written back overriding the new data. */ spin_lock(&tree->lock); dupentry = zswap_rb_search(&tree->rbroot, offset); if (dupentry) { zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); if (shrink_memcg(memcg)) { mem_cgroup_put(memcg); goto reject; } mem_cgroup_put(memcg); } /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; zswap_pool_reached_full = true; goto shrink; } if (zswap_pool_reached_full) { if (!zswap_can_accept()) goto shrink; else zswap_pool_reached_full = false; } /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); goto insert_entry; } kunmap_local(src); } if (!zswap_non_same_filled_pages_enabled) goto freepage; /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) goto freepage; if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } mem_cgroup_put(memcg); } /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, &folio->page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, * and hardware-accelerators may won't check the dst buffer size, so * giving the dst buffer with enough length to avoid buffer overflow. */ sg_init_one(&output, dst, PAGE_SIZE * 2); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* * it maybe looks a little bit silly that we send an asynchronous request, * then wait for its completion synchronously. This makes the process look * synchronous in fact. * Theoretically, acomp supports users send multiple acomp requests in one * acomp instance, then get those requests done simultaneously. but in this * case, zswap actually does store and load page by page, there is no * existing method to send the second page before the first page is done * in one thread doing zwap. * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. */ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; if (ret) { zswap_reject_compress_fail++; goto put_dstmem; } /* store */ zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; ret = zpool_malloc(zpool, dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; } if (ret) { zswap_reject_alloc_fail++; goto put_dstmem; } buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); zpool_unmap_handle(zpool, handle); mutex_unlock(&acomp_ctx->mutex); /* populate entry */ entry->swpentry = swp_entry(type, offset); entry->handle = handle; entry->length = dlen; insert_entry: entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); /* Account before objcg ref is moved to tree */ count_objcg_event(objcg, ZSWPOUT); } /* map */ spin_lock(&tree->lock); /* * A duplicate entry should have been removed at the beginning of this * function. Since the swap entry should be pinned, if a duplicate is * found again here it means that something went wrong in the swap * cache. */ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { WARN_ON(1); zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); zswap_update_total_size(); count_vm_event(ZSWPOUT); return true; put_dstmem: mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); reject: if (objcg) obj_cgroup_put(objcg); return false; shrink: pool = zswap_pool_last_get(); if (pool && !queue_work(shrink_wq, &pool->shrink_work)) zswap_pool_put(pool); goto reject; } bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); /* find */ spin_lock(&tree->lock); entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } spin_unlock(&tree->lock); if (entry->length) __zswap_load(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); kunmap_local(dst); } count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); spin_lock(&tree->lock); if (zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { zswap_lru_del(&entry->pool->list_lru, entry); zswap_lru_add(&entry->pool->list_lru, entry); } zswap_entry_put(tree, entry); spin_unlock(&tree->lock); return true; } void zswap_invalidate(int type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { /* entry was written back */ spin_unlock(&tree->lock); return; } zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } void zswap_swapon(int type) { struct zswap_tree *tree; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; } tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; } void zswap_swapoff(int type) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) zswap_free_entry(entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); kfree(tree); zswap_trees[type] = NULL; } /********************************* * debugfs functions **********************************/ #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *zswap_debugfs_root; static int zswap_debugfs_init(void) { if (!debugfs_initialized()) return -ENODEV; zswap_debugfs_root = debugfs_create_dir("zswap", NULL); debugfs_create_u64("pool_limit_hit", 0444, zswap_debugfs_root, &zswap_pool_limit_hit); debugfs_create_u64("reject_reclaim_fail", 0444, zswap_debugfs_root, &zswap_reject_reclaim_fail); debugfs_create_u64("reject_alloc_fail", 0444, zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", 0444, zswap_debugfs_root, &zswap_reject_kmemcache_fail); debugfs_create_u64("reject_compress_fail", 0444, zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", 0444, zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, zswap_debugfs_root, &zswap_same_filled_pages); return 0; } #else static int zswap_debugfs_init(void) { return 0; } #endif /********************************* * module init and exit **********************************/ static int zswap_setup(void) { struct zswap_pool *pool; int ret; zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); if (!zswap_entry_cache) { pr_err("entry cache creation failed\n"); goto cache_fail; } ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, zswap_cpu_comp_dead); if (ret) goto hp_fail; pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, zpool_get_type(pool->zpools[0])); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; } else { pr_err("pool creation failed\n"); zswap_enabled = false; } shrink_wq = create_workqueue("zswap-shrink"); if (!shrink_wq) goto fallback_fail; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; fallback_fail: if (pool) zswap_pool_destroy(pool); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: /* if built-in, we aren't unloaded on failure; don't allow use */ zswap_init_state = ZSWAP_INIT_FAILED; zswap_enabled = false; return -ENOMEM; } static int __init zswap_init(void) { if (!zswap_enabled) return 0; return zswap_setup(); } /* must be late so crypto has time to come up */ late_initcall(zswap_init); MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); MODULE_DESCRIPTION("Compressed cache for swap pages");
133 136 136 103 132 129 130 12 4 3 3 3 2 3 2 2 4 36 36 36 34 34 34 34 34 36 36 36 36 32 7 29 8 8 8 8 8 8 8 8 8 8 8 8 6 6 6 36 36 35 36 36 36 36 6 6 1 1 6 6 34 32 32 34 1 10 10 10 10 10 32 12 3 34 34 34 34 34 33 34 34 34 34 10 10 34 34 3 34 10 1 10 10 1 1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Ports * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ #include <sound/core.h> #include <linux/slab.h> #include <linux/module.h> #include "seq_system.h" #include "seq_ports.h" #include "seq_clientmgr.h" /* registration of client ports */ /* NOTE: the current implementation of the port structure as a linked list is not optimal for clients that have many ports. For sending messages to all subscribers of a port we first need to find the address of the port structure, which means we have to traverse the list. A direct access table (array) would be better, but big preallocated arrays waste memory. Possible actions: 1) leave it this way, a client does normaly does not have more than a few ports 2) replace the linked list of ports by a array of pointers which is dynamicly kmalloced. When a port is added or deleted we can simply allocate a new array, copy the corresponding pointers, and delete the old one. We then only need a pointer to this array, and an integer that tells us how much elements are in array. */ /* return pointer to port structure - port is locked if found */ struct snd_seq_client_port *snd_seq_port_use_ptr(struct snd_seq_client *client, int num) { struct snd_seq_client_port *port; if (client == NULL) return NULL; read_lock(&client->ports_lock); list_for_each_entry(port, &client->ports_list_head, list) { if (port->addr.port == num) { if (port->closing) break; /* deleting now */ snd_use_lock_use(&port->use_lock); read_unlock(&client->ports_lock); return port; } } read_unlock(&client->ports_lock); return NULL; /* not found */ } /* search for the next port - port is locked if found */ struct snd_seq_client_port *snd_seq_port_query_nearest(struct snd_seq_client *client, struct snd_seq_port_info *pinfo) { int num; struct snd_seq_client_port *port, *found; bool check_inactive = (pinfo->capability & SNDRV_SEQ_PORT_CAP_INACTIVE); num = pinfo->addr.port; found = NULL; read_lock(&client->ports_lock); list_for_each_entry(port, &client->ports_list_head, list) { if ((port->capability & SNDRV_SEQ_PORT_CAP_INACTIVE) && !check_inactive) continue; /* skip inactive ports */ if (port->addr.port < num) continue; if (port->addr.port == num) { found = port; break; } if (found == NULL || port->addr.port < found->addr.port) found = port; } if (found) { if (found->closing) found = NULL; else snd_use_lock_use(&found->use_lock); } read_unlock(&client->ports_lock); return found; } /* initialize snd_seq_port_subs_info */ static void port_subs_info_init(struct snd_seq_port_subs_info *grp) { INIT_LIST_HEAD(&grp->list_head); grp->count = 0; grp->exclusive = 0; rwlock_init(&grp->list_lock); init_rwsem(&grp->list_mutex); grp->open = NULL; grp->close = NULL; } /* create a port, port number or a negative error code is returned * the caller needs to unref the port via snd_seq_port_unlock() appropriately */ int snd_seq_create_port(struct snd_seq_client *client, int port, struct snd_seq_client_port **port_ret) { struct snd_seq_client_port *new_port, *p; int num; *port_ret = NULL; /* sanity check */ if (snd_BUG_ON(!client)) return -EINVAL; if (client->num_ports >= SNDRV_SEQ_MAX_PORTS) { pr_warn("ALSA: seq: too many ports for client %d\n", client->number); return -EINVAL; } /* create a new port */ new_port = kzalloc(sizeof(*new_port), GFP_KERNEL); if (!new_port) return -ENOMEM; /* failure, out of memory */ /* init port data */ new_port->addr.client = client->number; new_port->addr.port = -1; new_port->owner = THIS_MODULE; snd_use_lock_init(&new_port->use_lock); port_subs_info_init(&new_port->c_src); port_subs_info_init(&new_port->c_dest); snd_use_lock_use(&new_port->use_lock); num = max(port, 0); mutex_lock(&client->ports_mutex); write_lock_irq(&client->ports_lock); list_for_each_entry(p, &client->ports_list_head, list) { if (p->addr.port == port) { kfree(new_port); num = -EBUSY; goto unlock; } if (p->addr.port > num) break; if (port < 0) /* auto-probe mode */ num = p->addr.port + 1; } /* insert the new port */ list_add_tail(&new_port->list, &p->list); client->num_ports++; new_port->addr.port = num; /* store the port number in the port */ sprintf(new_port->name, "port-%d", num); *port_ret = new_port; unlock: write_unlock_irq(&client->ports_lock); mutex_unlock(&client->ports_mutex); return num; } /* */ static int subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack); static int unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack); static struct snd_seq_client_port *get_client_port(struct snd_seq_addr *addr, struct snd_seq_client **cp) { struct snd_seq_client_port *p; *cp = snd_seq_client_use_ptr(addr->client); if (*cp) { p = snd_seq_port_use_ptr(*cp, addr->port); if (! p) { snd_seq_client_unlock(*cp); *cp = NULL; } return p; } return NULL; } static void delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack); static inline struct snd_seq_subscribers * get_subscriber(struct list_head *p, bool is_src) { if (is_src) return list_entry(p, struct snd_seq_subscribers, src_list); else return list_entry(p, struct snd_seq_subscribers, dest_list); } /* * remove all subscribers on the list * this is called from port_delete, for each src and dest list. */ static void clear_subscriber_list(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, int is_src) { struct list_head *p, *n; list_for_each_safe(p, n, &grp->list_head) { struct snd_seq_subscribers *subs; struct snd_seq_client *c; struct snd_seq_client_port *aport; subs = get_subscriber(p, is_src); if (is_src) aport = get_client_port(&subs->info.dest, &c); else aport = get_client_port(&subs->info.sender, &c); delete_and_unsubscribe_port(client, port, subs, is_src, false); if (!aport) { /* looks like the connected port is being deleted. * we decrease the counter, and when both ports are deleted * remove the subscriber info */ if (atomic_dec_and_test(&subs->ref_count)) kfree(subs); continue; } /* ok we got the connected port */ delete_and_unsubscribe_port(c, aport, subs, !is_src, true); kfree(subs); snd_seq_port_unlock(aport); snd_seq_client_unlock(c); } } /* delete port data */ static int port_delete(struct snd_seq_client *client, struct snd_seq_client_port *port) { /* set closing flag and wait for all port access are gone */ port->closing = 1; snd_use_lock_sync(&port->use_lock); /* clear subscribers info */ clear_subscriber_list(client, port, &port->c_src, true); clear_subscriber_list(client, port, &port->c_dest, false); if (port->private_free) port->private_free(port->private_data); snd_BUG_ON(port->c_src.count != 0); snd_BUG_ON(port->c_dest.count != 0); kfree(port); return 0; } /* delete a port with the given port id */ int snd_seq_delete_port(struct snd_seq_client *client, int port) { struct snd_seq_client_port *found = NULL, *p; mutex_lock(&client->ports_mutex); write_lock_irq(&client->ports_lock); list_for_each_entry(p, &client->ports_list_head, list) { if (p->addr.port == port) { /* ok found. delete from the list at first */ list_del(&p->list); client->num_ports--; found = p; break; } } write_unlock_irq(&client->ports_lock); mutex_unlock(&client->ports_mutex); if (found) return port_delete(client, found); else return -ENOENT; } /* delete the all ports belonging to the given client */ int snd_seq_delete_all_ports(struct snd_seq_client *client) { struct list_head deleted_list; struct snd_seq_client_port *port, *tmp; /* move the port list to deleted_list, and * clear the port list in the client data. */ mutex_lock(&client->ports_mutex); write_lock_irq(&client->ports_lock); if (! list_empty(&client->ports_list_head)) { list_add(&deleted_list, &client->ports_list_head); list_del_init(&client->ports_list_head); } else { INIT_LIST_HEAD(&deleted_list); } client->num_ports = 0; write_unlock_irq(&client->ports_lock); /* remove each port in deleted_list */ list_for_each_entry_safe(port, tmp, &deleted_list, list) { list_del(&port->list); snd_seq_system_client_ev_port_exit(port->addr.client, port->addr.port); port_delete(client, port); } mutex_unlock(&client->ports_mutex); return 0; } /* set port info fields */ int snd_seq_set_port_info(struct snd_seq_client_port * port, struct snd_seq_port_info * info) { if (snd_BUG_ON(!port || !info)) return -EINVAL; /* set port name */ if (info->name[0]) strscpy(port->name, info->name, sizeof(port->name)); /* set capabilities */ port->capability = info->capability; /* get port type */ port->type = info->type; /* information about supported channels/voices */ port->midi_channels = info->midi_channels; port->midi_voices = info->midi_voices; port->synth_voices = info->synth_voices; /* timestamping */ port->timestamping = (info->flags & SNDRV_SEQ_PORT_FLG_TIMESTAMP) ? 1 : 0; port->time_real = (info->flags & SNDRV_SEQ_PORT_FLG_TIME_REAL) ? 1 : 0; port->time_queue = info->time_queue; /* UMP direction and group */ port->direction = info->direction; port->ump_group = info->ump_group; if (port->ump_group > SNDRV_UMP_MAX_GROUPS) port->ump_group = 0; /* fill default port direction */ if (!port->direction) { if (info->capability & SNDRV_SEQ_PORT_CAP_READ) port->direction |= SNDRV_SEQ_PORT_DIR_INPUT; if (info->capability & SNDRV_SEQ_PORT_CAP_WRITE) port->direction |= SNDRV_SEQ_PORT_DIR_OUTPUT; } return 0; } /* get port info fields */ int snd_seq_get_port_info(struct snd_seq_client_port * port, struct snd_seq_port_info * info) { if (snd_BUG_ON(!port || !info)) return -EINVAL; /* get port name */ strscpy(info->name, port->name, sizeof(info->name)); /* get capabilities */ info->capability = port->capability; /* get port type */ info->type = port->type; /* information about supported channels/voices */ info->midi_channels = port->midi_channels; info->midi_voices = port->midi_voices; info->synth_voices = port->synth_voices; /* get subscriber counts */ info->read_use = port->c_src.count; info->write_use = port->c_dest.count; /* timestamping */ info->flags = 0; if (port->timestamping) { info->flags |= SNDRV_SEQ_PORT_FLG_TIMESTAMP; if (port->time_real) info->flags |= SNDRV_SEQ_PORT_FLG_TIME_REAL; info->time_queue = port->time_queue; } /* UMP direction and group */ info->direction = port->direction; info->ump_group = port->ump_group; return 0; } /* * call callback functions (if any): * the callbacks are invoked only when the first (for connection) or * the last subscription (for disconnection) is done. Second or later * subscription results in increment of counter, but no callback is * invoked. * This feature is useful if these callbacks are associated with * initialization or termination of devices (see seq_midi.c). */ static int subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack) { int err = 0; if (!try_module_get(port->owner)) return -EFAULT; grp->count++; if (grp->open && grp->count == 1) { err = grp->open(port->private_data, info); if (err < 0) { module_put(port->owner); grp->count--; } } if (err >= 0 && send_ack && client->type == USER_CLIENT) snd_seq_client_notify_subscription(port->addr.client, port->addr.port, info, SNDRV_SEQ_EVENT_PORT_SUBSCRIBED); return err; } static int unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack) { int err = 0; if (! grp->count) return -EINVAL; grp->count--; if (grp->close && grp->count == 0) err = grp->close(port->private_data, info); if (send_ack && client->type == USER_CLIENT) snd_seq_client_notify_subscription(port->addr.client, port->addr.port, info, SNDRV_SEQ_EVENT_PORT_UNSUBSCRIBED); module_put(port->owner); return err; } /* check if both addresses are identical */ static inline int addr_match(struct snd_seq_addr *r, struct snd_seq_addr *s) { return (r->client == s->client) && (r->port == s->port); } /* check the two subscribe info match */ /* if flags is zero, checks only sender and destination addresses */ static int match_subs_info(struct snd_seq_port_subscribe *r, struct snd_seq_port_subscribe *s) { if (addr_match(&r->sender, &s->sender) && addr_match(&r->dest, &s->dest)) { if (r->flags && r->flags == s->flags) return r->queue == s->queue; else if (! r->flags) return 1; } return 0; } static int check_and_subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool exclusive, bool ack) { struct snd_seq_port_subs_info *grp; struct list_head *p; struct snd_seq_subscribers *s; int err; grp = is_src ? &port->c_src : &port->c_dest; err = -EBUSY; down_write(&grp->list_mutex); if (exclusive) { if (!list_empty(&grp->list_head)) goto __error; } else { if (grp->exclusive) goto __error; /* check whether already exists */ list_for_each(p, &grp->list_head) { s = get_subscriber(p, is_src); if (match_subs_info(&subs->info, &s->info)) goto __error; } } err = subscribe_port(client, port, grp, &subs->info, ack); if (err < 0) { grp->exclusive = 0; goto __error; } /* add to list */ write_lock_irq(&grp->list_lock); if (is_src) list_add_tail(&subs->src_list, &grp->list_head); else list_add_tail(&subs->dest_list, &grp->list_head); grp->exclusive = exclusive; atomic_inc(&subs->ref_count); write_unlock_irq(&grp->list_lock); err = 0; __error: up_write(&grp->list_mutex); return err; } /* called with grp->list_mutex held */ static void __delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack) { struct snd_seq_port_subs_info *grp; struct list_head *list; bool empty; grp = is_src ? &port->c_src : &port->c_dest; list = is_src ? &subs->src_list : &subs->dest_list; write_lock_irq(&grp->list_lock); empty = list_empty(list); if (!empty) list_del_init(list); grp->exclusive = 0; write_unlock_irq(&grp->list_lock); if (!empty) unsubscribe_port(client, port, grp, &subs->info, ack); } static void delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack) { struct snd_seq_port_subs_info *grp; grp = is_src ? &port->c_src : &port->c_dest; down_write(&grp->list_mutex); __delete_and_unsubscribe_port(client, port, subs, is_src, ack); up_write(&grp->list_mutex); } /* connect two ports */ int snd_seq_port_connect(struct snd_seq_client *connector, struct snd_seq_client *src_client, struct snd_seq_client_port *src_port, struct snd_seq_client *dest_client, struct snd_seq_client_port *dest_port, struct snd_seq_port_subscribe *info) { struct snd_seq_subscribers *subs; bool exclusive; int err; subs = kzalloc(sizeof(*subs), GFP_KERNEL); if (!subs) return -ENOMEM; subs->info = *info; atomic_set(&subs->ref_count, 0); INIT_LIST_HEAD(&subs->src_list); INIT_LIST_HEAD(&subs->dest_list); exclusive = !!(info->flags & SNDRV_SEQ_PORT_SUBS_EXCLUSIVE); err = check_and_subscribe_port(src_client, src_port, subs, true, exclusive, connector->number != src_client->number); if (err < 0) goto error; err = check_and_subscribe_port(dest_client, dest_port, subs, false, exclusive, connector->number != dest_client->number); if (err < 0) goto error_dest; return 0; error_dest: delete_and_unsubscribe_port(src_client, src_port, subs, true, connector->number != src_client->number); error: kfree(subs); return err; } /* remove the connection */ int snd_seq_port_disconnect(struct snd_seq_client *connector, struct snd_seq_client *src_client, struct snd_seq_client_port *src_port, struct snd_seq_client *dest_client, struct snd_seq_client_port *dest_port, struct snd_seq_port_subscribe *info) { struct snd_seq_port_subs_info *dest = &dest_port->c_dest; struct snd_seq_subscribers *subs; int err = -ENOENT; /* always start from deleting the dest port for avoiding concurrent * deletions */ down_write(&dest->list_mutex); /* look for the connection */ list_for_each_entry(subs, &dest->list_head, dest_list) { if (match_subs_info(info, &subs->info)) { __delete_and_unsubscribe_port(dest_client, dest_port, subs, false, connector->number != dest_client->number); err = 0; break; } } up_write(&dest->list_mutex); if (err < 0) return err; delete_and_unsubscribe_port(src_client, src_port, subs, true, connector->number != src_client->number); kfree(subs); return 0; } /* get matched subscriber */ int snd_seq_port_get_subscription(struct snd_seq_port_subs_info *src_grp, struct snd_seq_addr *dest_addr, struct snd_seq_port_subscribe *subs) { struct snd_seq_subscribers *s; int err = -ENOENT; down_read(&src_grp->list_mutex); list_for_each_entry(s, &src_grp->list_head, src_list) { if (addr_match(dest_addr, &s->info.dest)) { *subs = s->info; err = 0; break; } } up_read(&src_grp->list_mutex); return err; } /* * Attach a device driver that wants to receive events from the * sequencer. Returns the new port number on success. * A driver that wants to receive the events converted to midi, will * use snd_seq_midisynth_register_port(). */ /* exported */ int snd_seq_event_port_attach(int client, struct snd_seq_port_callback *pcbp, int cap, int type, int midi_channels, int midi_voices, char *portname) { struct snd_seq_port_info portinfo; int ret; /* Set up the port */ memset(&portinfo, 0, sizeof(portinfo)); portinfo.addr.client = client; strscpy(portinfo.name, portname ? portname : "Unnamed port", sizeof(portinfo.name)); portinfo.capability = cap; portinfo.type = type; portinfo.kernel = pcbp; portinfo.midi_channels = midi_channels; portinfo.midi_voices = midi_voices; /* Create it */ ret = snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_CREATE_PORT, &portinfo); if (ret >= 0) ret = portinfo.addr.port; return ret; } EXPORT_SYMBOL(snd_seq_event_port_attach); /* * Detach the driver from a port. */ /* exported */ int snd_seq_event_port_detach(int client, int port) { struct snd_seq_port_info portinfo; int err; memset(&portinfo, 0, sizeof(portinfo)); portinfo.addr.client = client; portinfo.addr.port = port; err = snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_DELETE_PORT, &portinfo); return err; } EXPORT_SYMBOL(snd_seq_event_port_detach);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio balloon implementation, inspired by Dor Laor and Marcelo * Tosatti's implementations. * * Copyright 2008 Rusty Russell IBM Corporation */ #include <linux/virtio.h> #include <linux/virtio_balloon.h> #include <linux/swap.h> #include <linux/workqueue.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/balloon_compaction.h> #include <linux/oom.h> #include <linux/wait.h> #include <linux/mm.h> #include <linux/page_reporting.h> /* * Balloon device works in 4K page units. So each page is pointed to by * multiple balloon pages. All memory counters in this driver are in balloon * page units. */ #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned int)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 /* Maximum number of (4k) pages to deflate on OOM notifications. */ #define VIRTIO_BALLOON_OOM_NR_PAGES 256 #define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80 #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ __GFP_NOMEMALLOC) /* The order of free page blocks to report to host */ #define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_PAGE_ORDER /* The size of a free page block in bytes */ #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, VIRTIO_BALLOON_VQ_STATS, VIRTIO_BALLOON_VQ_FREE_PAGE, VIRTIO_BALLOON_VQ_REPORTING, VIRTIO_BALLOON_VQ_MAX }; enum virtio_balloon_config_read { VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0, }; struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; /* Balloon's own wq for cpu-intensive work items */ struct workqueue_struct *balloon_wq; /* The free page reporting work item submitted to the balloon wq */ struct work_struct report_free_page_work; /* The balloon servicing is delegated to a freezable workqueue. */ struct work_struct update_balloon_stats_work; struct work_struct update_balloon_size_work; /* Prevent updating balloon when it is being canceled. */ spinlock_t stop_update_lock; bool stop_update; /* Bitmap to indicate if reading the related config fields are needed */ unsigned long config_read_bitmap; /* The list of allocated free pages, waiting to be given back to mm */ struct list_head free_page_list; spinlock_t free_page_list_lock; /* The number of free page blocks on the above list */ unsigned long num_free_page_blocks; /* * The cmd id received from host. * Read it via virtio_balloon_cmd_id_received to get the latest value * sent from host. */ u32 cmd_id_received_cache; /* The cmd id that is actively in use */ __virtio32 cmd_id_active; /* Buffer to store the stop sign */ __virtio32 cmd_id_stop; /* Waiting for host to ack the pages we released. */ wait_queue_head_t acked; /* Number of balloon pages we've told the Host we're not using. */ unsigned int num_pages; /* * The pages we've told the Host we're not using are enqueued * at vb_dev_info->pages list. * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE * to num_pages above. */ struct balloon_dev_info vb_dev_info; /* Synchronize access/update to this struct virtio_balloon elements */ struct mutex balloon_lock; /* The array of pfns we tell the Host about. */ unsigned int num_pfns; __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; /* Memory statistics */ struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */ struct shrinker *shrinker; /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */ struct notifier_block oom_nb; /* Free page reporting device */ struct virtqueue *reporting_vq; struct page_reporting_dev_info pr_dev_info; /* State for keeping the wakeup_source active while adjusting the balloon */ spinlock_t adjustment_lock; bool adjustment_signal_pending; bool adjustment_in_progress; }; static const struct virtio_device_id id_table[] = { { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, { 0 }, }; static u32 page_to_balloon_pfn(struct page *page) { unsigned long pfn = page_to_pfn(page); BUILD_BUG_ON(PAGE_SHIFT < VIRTIO_BALLOON_PFN_SHIFT); /* Convert pfn from Linux page size to balloon page size. */ return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE; } static void balloon_ack(struct virtqueue *vq) { struct virtio_balloon *vb = vq->vdev->priv; wake_up(&vb->acked); } static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) { struct scatterlist sg; unsigned int len; sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); /* We should always be able to add one buffer to an empty queue. */ virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &len)); } static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info, struct scatterlist *sg, unsigned int nents) { struct virtio_balloon *vb = container_of(pr_dev_info, struct virtio_balloon, pr_dev_info); struct virtqueue *vq = vb->reporting_vq; unsigned int unused, err; /* We should always be able to add these buffers to an empty queue. */ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); /* * In the extremely unlikely case that something has occurred and we * are able to trigger an error we will simply display a warning * and exit without actually processing the pages. */ if (WARN_ON_ONCE(err)) return err; virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); return 0; } static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { unsigned int i; BUILD_BUG_ON(VIRTIO_BALLOON_PAGES_PER_PAGE > VIRTIO_BALLOON_ARRAY_PFNS_MAX); /* * Set balloon pfns pointing at this page. * Note that the first pfn points at start of the page. */ for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++) pfns[i] = cpu_to_virtio32(vb->vdev, page_to_balloon_pfn(page) + i); } static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) { unsigned int num_allocated_pages; unsigned int num_pfns; struct page *page; LIST_HEAD(pages); /* We can only do one array worth at a time. */ num = min(num, ARRAY_SIZE(vb->pfns)); for (num_pfns = 0; num_pfns < num; num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { struct page *page = balloon_page_alloc(); if (!page) { dev_info_ratelimited(&vb->vdev->dev, "Out of puff! Can't get %u pages\n", VIRTIO_BALLOON_PAGES_PER_PAGE); /* Sleep for at least 1/5 of a second before retry. */ msleep(200); break; } balloon_page_push(&pages, page); } mutex_lock(&vb->balloon_lock); vb->num_pfns = 0; while ((page = balloon_page_pop(&pages))) { balloon_page_enqueue(&vb->vb_dev_info, page); set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, -1); vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; } num_allocated_pages = vb->num_pfns; /* Did we get any? */ if (vb->num_pfns != 0) tell_host(vb, vb->inflate_vq); mutex_unlock(&vb->balloon_lock); return num_allocated_pages; } static void release_pages_balloon(struct virtio_balloon *vb, struct list_head *pages) { struct page *page, *next; list_for_each_entry_safe(page, next, pages, lru) { if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, 1); list_del(&page->lru); put_page(page); /* balloon reference */ } } static unsigned int leak_balloon(struct virtio_balloon *vb, size_t num) { unsigned int num_freed_pages; struct page *page; struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; LIST_HEAD(pages); /* We can only do one array worth at a time. */ num = min(num, ARRAY_SIZE(vb->pfns)); mutex_lock(&vb->balloon_lock); /* We can't release more pages than taken */ num = min(num, (size_t)vb->num_pages); for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { page = balloon_page_dequeue(vb_dev_info); if (!page) break; set_page_pfns(vb, vb->pfns + vb->num_pfns, page); list_add(&page->lru, &pages); vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; } num_freed_pages = vb->num_pfns; /* * Note that if * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); * is true, we *have* to do it in this order */ if (vb->num_pfns != 0) tell_host(vb, vb->deflate_vq); release_pages_balloon(vb, &pages); mutex_unlock(&vb->balloon_lock); return num_freed_pages; } static inline void update_stat(struct virtio_balloon *vb, int idx, u16 tag, u64 val) { BUG_ON(idx >= VIRTIO_BALLOON_S_NR); vb->stats[idx].tag = cpu_to_virtio16(vb->vdev, tag); vb->stats[idx].val = cpu_to_virtio64(vb->vdev, val); } #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) static unsigned int update_balloon_stats(struct virtio_balloon *vb) { unsigned long events[NR_VM_EVENT_ITEMS]; struct sysinfo i; unsigned int idx = 0; long available; unsigned long caches; all_vm_events(events); si_meminfo(&i); available = si_mem_available(); caches = global_node_page_state(NR_FILE_PAGES); #ifdef CONFIG_VM_EVENT_COUNTERS update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, pages_to_bytes(events[PSWPIN])); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, events[HTLB_BUDDY_PGALLOC_FAIL]); #endif #endif update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, pages_to_bytes(i.totalram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL, pages_to_bytes(available)); update_stat(vb, idx++, VIRTIO_BALLOON_S_CACHES, pages_to_bytes(caches)); return idx; } /* * While most virtqueues communicate guest-initiated requests to the hypervisor, * the stats queue operates in reverse. The driver initializes the virtqueue * with a single buffer. From that point forward, all conversations consist of * a hypervisor request (a call to this function) which directs us to refill * the virtqueue with a fresh stats buffer. Since stats collection can sleep, * we delegate the job to a freezable workqueue that will do the actual work via * stats_handle_request(). */ static void stats_request(struct virtqueue *vq) { struct virtio_balloon *vb = vq->vdev->priv; spin_lock(&vb->stop_update_lock); if (!vb->stop_update) queue_work(system_freezable_wq, &vb->update_balloon_stats_work); spin_unlock(&vb->stop_update_lock); } static void stats_handle_request(struct virtio_balloon *vb) { struct virtqueue *vq; struct scatterlist sg; unsigned int len, num_stats; num_stats = update_balloon_stats(vb); vq = vb->stats_vq; if (!virtqueue_get_buf(vq, &len)) return; sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); virtqueue_kick(vq); } static inline s64 towards_target(struct virtio_balloon *vb) { s64 target; u32 num_pages; /* Legacy balloon config space is LE, unlike all other devices. */ virtio_cread_le(vb->vdev, struct virtio_balloon_config, num_pages, &num_pages); /* * Aligned up to guest page size to avoid inflating and deflating * balloon endlessly. */ target = ALIGN(num_pages, VIRTIO_BALLOON_PAGES_PER_PAGE); return target - vb->num_pages; } /* Gives back @num_to_return blocks of free pages to mm. */ static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb, unsigned long num_to_return) { struct page *page; unsigned long num_returned; spin_lock_irq(&vb->free_page_list_lock); for (num_returned = 0; num_returned < num_to_return; num_returned++) { page = balloon_page_pop(&vb->free_page_list); if (!page) break; free_pages((unsigned long)page_address(page), VIRTIO_BALLOON_HINT_BLOCK_ORDER); } vb->num_free_page_blocks -= num_returned; spin_unlock_irq(&vb->free_page_list_lock); return num_returned; } static void virtio_balloon_queue_free_page_work(struct virtio_balloon *vb) { if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) return; /* No need to queue the work if the bit was already set. */ if (test_and_set_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID, &vb->config_read_bitmap)) return; queue_work(vb->balloon_wq, &vb->report_free_page_work); } static void start_update_balloon_size(struct virtio_balloon *vb) { unsigned long flags; spin_lock_irqsave(&vb->adjustment_lock, flags); vb->adjustment_signal_pending = true; if (!vb->adjustment_in_progress) { vb->adjustment_in_progress = true; pm_stay_awake(vb->vdev->dev.parent); } spin_unlock_irqrestore(&vb->adjustment_lock, flags); queue_work(system_freezable_wq, &vb->update_balloon_size_work); } static void end_update_balloon_size(struct virtio_balloon *vb) { spin_lock(&vb->adjustment_lock); if (!vb->adjustment_signal_pending && vb->adjustment_in_progress) { vb->adjustment_in_progress = false; pm_relax(vb->vdev->dev.parent); } spin_unlock(&vb->adjustment_lock); } static void virtballoon_changed(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; unsigned long flags; spin_lock_irqsave(&vb->stop_update_lock, flags); if (!vb->stop_update) { start_update_balloon_size(vb); virtio_balloon_queue_free_page_work(vb); } spin_unlock_irqrestore(&vb->stop_update_lock, flags); } static void update_balloon_size(struct virtio_balloon *vb) { u32 actual = vb->num_pages; /* Legacy balloon config space is LE, unlike all other devices. */ virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, actual, &actual); } static void update_balloon_stats_func(struct work_struct *work) { struct virtio_balloon *vb; vb = container_of(work, struct virtio_balloon, update_balloon_stats_work); stats_handle_request(vb); } static void update_balloon_size_func(struct work_struct *work) { struct virtio_balloon *vb; s64 diff; vb = container_of(work, struct virtio_balloon, update_balloon_size_work); spin_lock(&vb->adjustment_lock); vb->adjustment_signal_pending = false; spin_unlock(&vb->adjustment_lock); diff = towards_target(vb); if (diff) { if (diff > 0) diff -= fill_balloon(vb, diff); else diff += leak_balloon(vb, -diff); update_balloon_size(vb); } if (diff) queue_work(system_freezable_wq, work); else end_update_balloon_size(vb); } static int init_vqs(struct virtio_balloon *vb) { struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX]; vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX]; const char *names[VIRTIO_BALLOON_VQ_MAX]; int err; /* * Inflateq and deflateq are used unconditionally. The names[] * will be NULL if the related feature is not enabled, which will * cause no allocation for the corresponding virtqueue in find_vqs. */ callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack; names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate"; callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack; names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate"; callbacks[VIRTIO_BALLOON_VQ_STATS] = NULL; names[VIRTIO_BALLOON_VQ_STATS] = NULL; callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; names[VIRTIO_BALLOON_VQ_REPORTING] = NULL; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { names[VIRTIO_BALLOON_VQ_STATS] = "stats"; callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request; } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq"; callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq"; callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack; } err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs, callbacks, names, NULL); if (err) return err; vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE]; vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE]; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { struct scatterlist sg; unsigned int num_stats; vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS]; /* * Prime this virtqueue with one buffer so the hypervisor can * use it to signal us later (it can't be broken yet!). */ num_stats = update_balloon_stats(vb); sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); err = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL); if (err) { dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n", __func__); return err; } virtqueue_kick(vb->stats_vq); } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING]; return 0; } static u32 virtio_balloon_cmd_id_received(struct virtio_balloon *vb) { if (test_and_clear_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID, &vb->config_read_bitmap)) { /* Legacy balloon config space is LE, unlike all other devices. */ virtio_cread_le(vb->vdev, struct virtio_balloon_config, free_page_hint_cmd_id, &vb->cmd_id_received_cache); } return vb->cmd_id_received_cache; } static int send_cmd_id_start(struct virtio_balloon *vb) { struct scatterlist sg; struct virtqueue *vq = vb->free_page_vq; int err, unused; /* Detach all the used buffers from the vq */ while (virtqueue_get_buf(vq, &unused)) ; vb->cmd_id_active = cpu_to_virtio32(vb->vdev, virtio_balloon_cmd_id_received(vb)); sg_init_one(&sg, &vb->cmd_id_active, sizeof(vb->cmd_id_active)); err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_active, GFP_KERNEL); if (!err) virtqueue_kick(vq); return err; } static int send_cmd_id_stop(struct virtio_balloon *vb) { struct scatterlist sg; struct virtqueue *vq = vb->free_page_vq; int err, unused; /* Detach all the used buffers from the vq */ while (virtqueue_get_buf(vq, &unused)) ; sg_init_one(&sg, &vb->cmd_id_stop, sizeof(vb->cmd_id_stop)); err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_stop, GFP_KERNEL); if (!err) virtqueue_kick(vq); return err; } static int get_free_page_and_send(struct virtio_balloon *vb) { struct virtqueue *vq = vb->free_page_vq; struct page *page; struct scatterlist sg; int err, unused; void *p; /* Detach all the used buffers from the vq */ while (virtqueue_get_buf(vq, &unused)) ; page = alloc_pages(VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG, VIRTIO_BALLOON_HINT_BLOCK_ORDER); /* * When the allocation returns NULL, it indicates that we have got all * the possible free pages, so return -EINTR to stop. */ if (!page) return -EINTR; p = page_address(page); sg_init_one(&sg, p, VIRTIO_BALLOON_HINT_BLOCK_BYTES); /* There is always 1 entry reserved for the cmd id to use. */ if (vq->num_free > 1) { err = virtqueue_add_inbuf(vq, &sg, 1, p, GFP_KERNEL); if (unlikely(err)) { free_pages((unsigned long)p, VIRTIO_BALLOON_HINT_BLOCK_ORDER); return err; } virtqueue_kick(vq); spin_lock_irq(&vb->free_page_list_lock); balloon_page_push(&vb->free_page_list, page); vb->num_free_page_blocks++; spin_unlock_irq(&vb->free_page_list_lock); } else { /* * The vq has no available entry to add this page block, so * just free it. */ free_pages((unsigned long)p, VIRTIO_BALLOON_HINT_BLOCK_ORDER); } return 0; } static int send_free_pages(struct virtio_balloon *vb) { int err; u32 cmd_id_active; while (1) { /* * If a stop id or a new cmd id was just received from host, * stop the reporting. */ cmd_id_active = virtio32_to_cpu(vb->vdev, vb->cmd_id_active); if (unlikely(cmd_id_active != virtio_balloon_cmd_id_received(vb))) break; /* * The free page blocks are allocated and sent to host one by * one. */ err = get_free_page_and_send(vb); if (err == -EINTR) break; else if (unlikely(err)) return err; } return 0; } static void virtio_balloon_report_free_page(struct virtio_balloon *vb) { int err; struct device *dev = &vb->vdev->dev; /* Start by sending the received cmd id to host with an outbuf. */ err = send_cmd_id_start(vb); if (unlikely(err)) dev_err(dev, "Failed to send a start id, err = %d\n", err); err = send_free_pages(vb); if (unlikely(err)) dev_err(dev, "Failed to send a free page, err = %d\n", err); /* End by sending a stop id to host with an outbuf. */ err = send_cmd_id_stop(vb); if (unlikely(err)) dev_err(dev, "Failed to send a stop id, err = %d\n", err); } static void report_free_page_func(struct work_struct *work) { struct virtio_balloon *vb = container_of(work, struct virtio_balloon, report_free_page_work); u32 cmd_id_received; cmd_id_received = virtio_balloon_cmd_id_received(vb); if (cmd_id_received == VIRTIO_BALLOON_CMD_ID_DONE) { /* Pass ULONG_MAX to give back all the free pages */ return_free_pages_to_mm(vb, ULONG_MAX); } else if (cmd_id_received != VIRTIO_BALLOON_CMD_ID_STOP && cmd_id_received != virtio32_to_cpu(vb->vdev, vb->cmd_id_active)) { virtio_balloon_report_free_page(vb); } } #ifdef CONFIG_BALLOON_COMPACTION /* * virtballoon_migratepage - perform the balloon page migration on behalf of * a compaction thread. (called under page lock) * @vb_dev_info: the balloon device * @newpage: page that will replace the isolated page after migration finishes. * @page : the isolated (old) page that is about to be migrated to newpage. * @mode : compaction mode -- not used for balloon page migration. * * After a ballooned page gets isolated by compaction procedures, this is the * function that performs the page migration on behalf of a compaction thread * The page migration for virtio balloon is done in a simple swap fashion which * follows these two macro steps: * 1) insert newpage into vb->pages list and update the host about it; * 2) update the host about the old page removed from vb->pages list; * * This function preforms the balloon page migration task. * Called through movable_operations->migrate_page */ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) { struct virtio_balloon *vb = container_of(vb_dev_info, struct virtio_balloon, vb_dev_info); unsigned long flags; /* * In order to avoid lock contention while migrating pages concurrently * to leak_balloon() or fill_balloon() we just give up the balloon_lock * this turn, as it is easier to retry the page migration later. * This also prevents fill_balloon() getting stuck into a mutex * recursion in the case it ends up triggering memory compaction * while it is attempting to inflate the ballon. */ if (!mutex_trylock(&vb->balloon_lock)) return -EAGAIN; get_page(newpage); /* balloon reference */ /* * When we migrate a page to a different zone and adjusted the * managed page count when inflating, we have to fixup the count of * both involved zones. */ if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) && page_zone(page) != page_zone(newpage)) { adjust_managed_page_count(page, 1); adjust_managed_page_count(newpage, -1); } /* balloon's page migration 1st step -- inflate "newpage" */ spin_lock_irqsave(&vb_dev_info->pages_lock, flags); balloon_page_insert(vb_dev_info, newpage); vb_dev_info->isolated_pages--; __count_vm_event(BALLOON_MIGRATE); spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb, vb->pfns, newpage); tell_host(vb, vb->inflate_vq); /* balloon's page migration 2nd step -- deflate "page" */ spin_lock_irqsave(&vb_dev_info->pages_lock, flags); balloon_page_delete(page); spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb, vb->pfns, page); tell_host(vb, vb->deflate_vq); mutex_unlock(&vb->balloon_lock); put_page(page); /* balloon reference */ return MIGRATEPAGE_SUCCESS; } #endif /* CONFIG_BALLOON_COMPACTION */ static unsigned long shrink_free_pages(struct virtio_balloon *vb, unsigned long pages_to_free) { unsigned long blocks_to_free, blocks_freed; pages_to_free = round_up(pages_to_free, VIRTIO_BALLOON_HINT_BLOCK_PAGES); blocks_to_free = pages_to_free / VIRTIO_BALLOON_HINT_BLOCK_PAGES; blocks_freed = return_free_pages_to_mm(vb, blocks_to_free); return blocks_freed * VIRTIO_BALLOON_HINT_BLOCK_PAGES; } static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { struct virtio_balloon *vb = shrinker->private_data; return shrink_free_pages(vb, sc->nr_to_scan); } static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { struct virtio_balloon *vb = shrinker->private_data; return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES; } static int virtio_balloon_oom_notify(struct notifier_block *nb, unsigned long dummy, void *parm) { struct virtio_balloon *vb = container_of(nb, struct virtio_balloon, oom_nb); unsigned long *freed = parm; *freed += leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES) / VIRTIO_BALLOON_PAGES_PER_PAGE; update_balloon_size(vb); return NOTIFY_OK; } static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) { shrinker_free(vb->shrinker); } static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) { vb->shrinker = shrinker_alloc(0, "virtio-balloon"); if (!vb->shrinker) return -ENOMEM; vb->shrinker->scan_objects = virtio_balloon_shrinker_scan; vb->shrinker->count_objects = virtio_balloon_shrinker_count; vb->shrinker->private_data = vb; shrinker_register(vb->shrinker); return 0; } static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; int err; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } vdev->priv = vb = kzalloc(sizeof(*vb), GFP_KERNEL); if (!vb) { err = -ENOMEM; goto out; } INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func); INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func); spin_lock_init(&vb->stop_update_lock); mutex_init(&vb->balloon_lock); init_waitqueue_head(&vb->acked); vb->vdev = vdev; balloon_devinfo_init(&vb->vb_dev_info); err = init_vqs(vb); if (err) goto out_free_vb; #ifdef CONFIG_BALLOON_COMPACTION vb->vb_dev_info.migratepage = virtballoon_migratepage; #endif if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { /* * There is always one entry reserved for cmd id, so the ring * size needs to be at least two to report free page hints. */ if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { err = -ENOSPC; goto out_del_vqs; } vb->balloon_wq = alloc_workqueue("balloon-wq", WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); if (!vb->balloon_wq) { err = -ENOMEM; goto out_del_vqs; } INIT_WORK(&vb->report_free_page_work, report_free_page_func); vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP; vb->cmd_id_active = cpu_to_virtio32(vb->vdev, VIRTIO_BALLOON_CMD_ID_STOP); vb->cmd_id_stop = cpu_to_virtio32(vb->vdev, VIRTIO_BALLOON_CMD_ID_STOP); spin_lock_init(&vb->free_page_list_lock); INIT_LIST_HEAD(&vb->free_page_list); /* * We're allowed to reuse any free pages, even if they are * still to be processed by the host. */ err = virtio_balloon_register_shrinker(vb); if (err) goto out_del_balloon_wq; } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { vb->oom_nb.notifier_call = virtio_balloon_oom_notify; vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY; err = register_oom_notifier(&vb->oom_nb); if (err < 0) goto out_unregister_shrinker; } if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { /* Start with poison val of 0 representing general init */ __u32 poison_val = 0; /* * Let the hypervisor know that we are expecting a * specific value to be written back in balloon pages. * * If the PAGE_POISON value was larger than a byte we would * need to byte swap poison_val here to guarantee it is * little-endian. However for now it is a single byte so we * can pass it as-is. */ if (!want_init_on_free()) memset(&poison_val, PAGE_POISON, sizeof(poison_val)); virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, poison_val, &poison_val); } vb->pr_dev_info.report = virtballoon_free_page_report; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { unsigned int capacity; capacity = virtqueue_get_vring_size(vb->reporting_vq); if (capacity < PAGE_REPORTING_CAPACITY) { err = -ENOSPC; goto out_unregister_oom; } /* * The default page reporting order is @pageblock_order, which * corresponds to 512MB in size on ARM64 when 64KB base page * size is used. The page reporting won't be triggered if the * freeing page can't come up with a free area like that huge. * So we specify the page reporting order to 5, corresponding * to 2MB. It helps to avoid THP splitting if 4KB base page * size is used by host. * * Ideally, the page reporting order is selected based on the * host's base page size. However, it needs more work to report * that value. The hard-coded order would be fine currently. */ #if defined(CONFIG_ARM64) && defined(CONFIG_ARM64_64K_PAGES) vb->pr_dev_info.order = 5; #endif err = page_reporting_register(&vb->pr_dev_info); if (err) goto out_unregister_oom; } spin_lock_init(&vb->adjustment_lock); virtio_device_ready(vdev); if (towards_target(vb)) virtballoon_changed(vdev); return 0; out_unregister_oom: if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) unregister_oom_notifier(&vb->oom_nb); out_unregister_shrinker: if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) virtio_balloon_unregister_shrinker(vb); out_del_balloon_wq: if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) destroy_workqueue(vb->balloon_wq); out_del_vqs: vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); out: return err; } static void remove_common(struct virtio_balloon *vb) { /* There might be pages left in the balloon: free them. */ while (vb->num_pages) leak_balloon(vb, vb->num_pages); update_balloon_size(vb); /* There might be free pages that are being reported: release them. */ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) return_free_pages_to_mm(vb, ULONG_MAX); /* Now we reset the device so we can clean up the queues. */ virtio_reset_device(vb->vdev); vb->vdev->config->del_vqs(vb->vdev); } static void virtballoon_remove(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) page_reporting_unregister(&vb->pr_dev_info); if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) unregister_oom_notifier(&vb->oom_nb); if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) virtio_balloon_unregister_shrinker(vb); spin_lock_irq(&vb->stop_update_lock); vb->stop_update = true; spin_unlock_irq(&vb->stop_update_lock); cancel_work_sync(&vb->update_balloon_size_work); cancel_work_sync(&vb->update_balloon_stats_work); if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { cancel_work_sync(&vb->report_free_page_work); destroy_workqueue(vb->balloon_wq); } remove_common(vb); kfree(vb); } #ifdef CONFIG_PM_SLEEP static int virtballoon_freeze(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; /* * The workqueue is already frozen by the PM core before this * function is called. */ remove_common(vb); return 0; } static int virtballoon_restore(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; int ret; ret = init_vqs(vdev->priv); if (ret) return ret; virtio_device_ready(vdev); if (towards_target(vb)) virtballoon_changed(vdev); update_balloon_size(vb); return 0; } #endif static int virtballoon_validate(struct virtio_device *vdev) { /* * Inform the hypervisor that our pages are poisoned or * initialized. If we cannot do that then we should disable * page reporting as it could potentially change the contents * of our free pages. */ if (!want_init_on_free() && !page_poisoning_enabled_static()) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); return 0; } static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_FREE_PAGE_HINT, VIRTIO_BALLOON_F_PAGE_POISON, VIRTIO_BALLOON_F_REPORTING, }; static struct virtio_driver virtio_balloon_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, .validate = virtballoon_validate, .probe = virtballoon_probe, .remove = virtballoon_remove, .config_changed = virtballoon_changed, #ifdef CONFIG_PM_SLEEP .freeze = virtballoon_freeze, .restore = virtballoon_restore, #endif }; module_virtio_driver(virtio_balloon_driver); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio balloon driver"); MODULE_LICENSE("GPL");
1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* * em_canid.c Ematch rule to match CAN frames according to their CAN IDs * * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de> * Copyright: (c) 2011 Czech Technical University in Prague * (c) 2011 Volkswagen Group Research * Authors: Michal Sojka <sojkam1@fel.cvut.cz> * Pavel Pisa <pisa@cmp.felk.cvut.cz> * Rostislav Lisovy <lisovy@gmail.cz> * Funded by: Volkswagen Group Research */ #include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> #include <linux/can.h> #define EM_CAN_RULES_MAX 500 struct canid_match { /* For each SFF CAN ID (11 bit) there is one record in this bitfield */ DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); int rules_count; int sff_rules_count; int eff_rules_count; /* * Raw rules copied from netlink message; Used for sending * information to userspace (when 'tc filter show' is invoked) * AND when matching EFF frames */ struct can_filter rules_raw[]; }; /** * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. * @skb: buffer to extract Can ID from */ static canid_t em_canid_get_id(struct sk_buff *skb) { /* CAN ID is stored within the data field */ struct can_frame *cf = (struct can_frame *)skb->data; return cf->can_id; } static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id, u32 can_mask) { int i; /* * Limit can_mask and can_id to SFF range to * protect against write after end of array */ can_mask &= CAN_SFF_MASK; can_id &= can_mask; /* Single frame */ if (can_mask == CAN_SFF_MASK) { set_bit(can_id, cm->match_sff); return; } /* All frames */ if (can_mask == 0) { bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS)); return; } /* * Individual frame filter. * Add record (set bit to 1) for each ID that * conforms particular rule */ for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) { if ((i & can_mask) == can_id) set_bit(i, cm->match_sff); } } static inline struct canid_match *em_canid_priv(struct tcf_ematch *m) { return (struct canid_match *)m->data; } static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { struct canid_match *cm = em_canid_priv(m); canid_t can_id; int match = 0; int i; const struct can_filter *lp; can_id = em_canid_get_id(skb); if (can_id & CAN_EFF_FLAG) { for (i = 0, lp = cm->rules_raw; i < cm->eff_rules_count; i++, lp++) { if (!(((lp->can_id ^ can_id) & lp->can_mask))) { match = 1; break; } } } else { /* SFF */ can_id &= CAN_SFF_MASK; match = (test_bit(can_id, cm->match_sff) ? 1 : 0); } return match; } static int em_canid_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ struct canid_match *cm; int i; if (!len) return -EINVAL; if (len % sizeof(struct can_filter)) return -EINVAL; if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX) return -EINVAL; cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL); if (!cm) return -ENOMEM; cm->rules_count = len / sizeof(struct can_filter); /* * We need two for() loops for copying rules into two contiguous * areas in rules_raw to process all eff rules with a simple loop. * NB: The configuration interface supports sff and eff rules. * We do not support filters here that match for the same can_id * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123). * For this (unusual case) two filters have to be specified. The * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id. */ /* Fill rules_raw with EFF rules first */ for (i = 0; i < cm->rules_count; i++) { if (conf[i].can_id & CAN_EFF_FLAG) { memcpy(cm->rules_raw + cm->eff_rules_count, &conf[i], sizeof(struct can_filter)); cm->eff_rules_count++; } } /* append SFF frame rules */ for (i = 0; i < cm->rules_count; i++) { if (!(conf[i].can_id & CAN_EFF_FLAG)) { memcpy(cm->rules_raw + cm->eff_rules_count + cm->sff_rules_count, &conf[i], sizeof(struct can_filter)); cm->sff_rules_count++; em_canid_sff_match_add(cm, conf[i].can_id, conf[i].can_mask); } } m->datalen = sizeof(struct canid_match) + len; m->data = (unsigned long)cm; return 0; } static void em_canid_destroy(struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); kfree(cm); } static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); /* * When configuring this ematch 'rules_count' is set not to exceed * 'rules_raw' array size */ if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count, &cm->rules_raw) < 0) return -EMSGSIZE; return 0; } static struct tcf_ematch_ops em_canid_ops = { .kind = TCF_EM_CANID, .change = em_canid_change, .match = em_canid_match, .destroy = em_canid_destroy, .dump = em_canid_dump, .owner = THIS_MODULE, .link = LIST_HEAD_INIT(em_canid_ops.link) }; static int __init init_em_canid(void) { return tcf_em_register(&em_canid_ops); } static void __exit exit_em_canid(void) { tcf_em_unregister(&em_canid_ops); } MODULE_LICENSE("GPL"); module_init(init_em_canid); module_exit(exit_em_canid); MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Framework and drivers for configuring and reading different PHYs * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * * Copyright (c) 2004 Freescale Semiconductor, Inc. */ #ifndef __PHY_H #define __PHY_H #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/leds.h> #include <linux/linkmode.h> #include <linux/netlink.h> #include <linux/mdio.h> #include <linux/mii.h> #include <linux/mii_timestamper.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/mod_devicetable.h> #include <linux/u64_stats_sync.h> #include <linux/irqreturn.h> #include <linux/iopoll.h> #include <linux/refcount.h> #include <linux/atomic.h> #define PHY_DEFAULT_FEATURES (SUPPORTED_Autoneg | \ SUPPORTED_TP | \ SUPPORTED_MII) #define PHY_10BT_FEATURES (SUPPORTED_10baseT_Half | \ SUPPORTED_10baseT_Full) #define PHY_100BT_FEATURES (SUPPORTED_100baseT_Half | \ SUPPORTED_100baseT_Full) #define PHY_1000BT_FEATURES (SUPPORTED_1000baseT_Half | \ SUPPORTED_1000baseT_Full) extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1s_p2mp_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init; #define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features) #define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features) #define PHY_BASIC_T1S_P2MP_FEATURES ((unsigned long *)&phy_basic_t1s_p2mp_features) #define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features) #define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features) #define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features) #define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features) #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features) #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features) #define PHY_EEE_CAP1_FEATURES ((unsigned long *)&phy_eee_cap1_features) extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; extern const int phy_basic_t1_features_array[3]; extern const int phy_basic_t1s_p2mp_features_array[2]; extern const int phy_gbit_features_array[2]; extern const int phy_10gbit_features_array[1]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, * or not desired for this PHY. Set to PHY_MAC_INTERRUPT if * the attached MAC driver handles the interrupt */ #define PHY_POLL -1 #define PHY_MAC_INTERRUPT -2 #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 #define PHY_POLL_CABLE_TEST 0x00000004 #define PHY_ALWAYS_CALL_SUSPEND 0x00000008 #define MDIO_DEVICE_IS_PHY 0x80000000 /** * enum phy_interface_t - Interface Mode definitions * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined * @PHY_INTERFACE_MODE_MII: Media-independent interface * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface * @PHY_INTERFACE_MODE_REVRMII: Reduced Media Independent Interface in PHY role * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI * @PHY_INTERFACE_MODE_SMII: Serial MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax * @PHY_INTERFACE_MODE_PSGMII: Penta SGMII * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX * @PHY_INTERFACE_MODE_5GBASER: 5G BaseR * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, PHY_INTERFACE_MODE_MII, PHY_INTERFACE_MODE_GMII, PHY_INTERFACE_MODE_SGMII, PHY_INTERFACE_MODE_TBI, PHY_INTERFACE_MODE_REVMII, PHY_INTERFACE_MODE_RMII, PHY_INTERFACE_MODE_REVRMII, PHY_INTERFACE_MODE_RGMII, PHY_INTERFACE_MODE_RGMII_ID, PHY_INTERFACE_MODE_RGMII_RXID, PHY_INTERFACE_MODE_RGMII_TXID, PHY_INTERFACE_MODE_RTBI, PHY_INTERFACE_MODE_SMII, PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_XLGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_PSGMII, PHY_INTERFACE_MODE_QSGMII, PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_100BASEX, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_5GBASER, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ PHY_INTERFACE_MODE_10GBASER, PHY_INTERFACE_MODE_25GBASER, PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_1000BASEKX, PHY_INTERFACE_MODE_MAX, } phy_interface_t; /* PHY interface mode bitmap handling */ #define DECLARE_PHY_INTERFACE_MASK(name) \ DECLARE_BITMAP(name, PHY_INTERFACE_MODE_MAX) static inline void phy_interface_zero(unsigned long *intf) { bitmap_zero(intf, PHY_INTERFACE_MODE_MAX); } static inline bool phy_interface_empty(const unsigned long *intf) { return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_and(unsigned long *dst, const unsigned long *a, const unsigned long *b) { bitmap_and(dst, a, b, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_or(unsigned long *dst, const unsigned long *a, const unsigned long *b) { bitmap_or(dst, a, b, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_set_rgmii(unsigned long *intf) { __set_bit(PHY_INTERFACE_MODE_RGMII, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_ID, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_RXID, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_TXID, intf); } /* * phy_supported_speeds - return all speeds currently supported by a PHY device */ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, unsigned int size); /** * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { switch (interface) { case PHY_INTERFACE_MODE_NA: return ""; case PHY_INTERFACE_MODE_INTERNAL: return "internal"; case PHY_INTERFACE_MODE_MII: return "mii"; case PHY_INTERFACE_MODE_GMII: return "gmii"; case PHY_INTERFACE_MODE_SGMII: return "sgmii"; case PHY_INTERFACE_MODE_TBI: return "tbi"; case PHY_INTERFACE_MODE_REVMII: return "rev-mii"; case PHY_INTERFACE_MODE_RMII: return "rmii"; case PHY_INTERFACE_MODE_REVRMII: return "rev-rmii"; case PHY_INTERFACE_MODE_RGMII: return "rgmii"; case PHY_INTERFACE_MODE_RGMII_ID: return "rgmii-id"; case PHY_INTERFACE_MODE_RGMII_RXID: return "rgmii-rxid"; case PHY_INTERFACE_MODE_RGMII_TXID: return "rgmii-txid"; case PHY_INTERFACE_MODE_RTBI: return "rtbi"; case PHY_INTERFACE_MODE_SMII: return "smii"; case PHY_INTERFACE_MODE_XGMII: return "xgmii"; case PHY_INTERFACE_MODE_XLGMII: return "xlgmii"; case PHY_INTERFACE_MODE_MOCA: return "moca"; case PHY_INTERFACE_MODE_PSGMII: return "psgmii"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; case PHY_INTERFACE_MODE_TRGMII: return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; case PHY_INTERFACE_MODE_1000BASEKX: return "1000base-kx"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_5GBASER: return "5gbase-r"; case PHY_INTERFACE_MODE_RXAUI: return "rxaui"; case PHY_INTERFACE_MODE_XAUI: return "xaui"; case PHY_INTERFACE_MODE_10GBASER: return "10gbase-r"; case PHY_INTERFACE_MODE_25GBASER: return "25gbase-r"; case PHY_INTERFACE_MODE_USXGMII: return "usxgmii"; case PHY_INTERFACE_MODE_10GKR: return "10gbase-kr"; case PHY_INTERFACE_MODE_100BASEX: return "100base-x"; case PHY_INTERFACE_MODE_QUSGMII: return "qusgmii"; default: return "unknown"; } } #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 #define PHY_MAX_ADDR 32 /* Used when trying to connect to a specific phy (mii bus id:phy device id) */ #define PHY_ID_FMT "%s:%02x" #define MII_BUS_ID_SIZE 61 struct device; struct kernel_hwtstamp_config; struct phylink; struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; /** * struct mdio_bus_stats - Statistics counters for MDIO busses * @transfers: Total number of transfers, i.e. @writes + @reads * @errors: Number of MDIO transfers that returned an error * @writes: Number of write transfers * @reads: Number of read transfers * @syncp: Synchronisation for incrementing statistics */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; u64_stats_t writes; u64_stats_t reads; /* Must be last, add new statistics above */ struct u64_stats_sync syncp; }; /** * struct phy_package_shared - Shared information in PHY packages * @base_addr: Base PHY address of PHY package used to combine PHYs * in one package and for offset calculation of phy_package_read/write * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv * @priv: Driver private data shared across a PHY package * * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ struct phy_package_shared { u8 base_addr; refcount_t refcnt; unsigned long flags; size_t priv_size; /* private data pointer */ /* note that this pointer is shared between different phydevs and * the user has to take care of appropriate locking. It is allocated * and freed automatically by phy_package_join() and * phy_package_leave(). */ void *priv; }; /* used as bit number in atomic bitops */ #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 /** * struct mii_bus - Represents an MDIO bus * * @owner: Who owns this device * @name: User friendly name for this MDIO device, or driver name * @id: Unique identifier for this bus, typical from bus hierarchy * @priv: Driver private data * * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure */ struct mii_bus { struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); /** @read_c45: Perform a C45 read transfer on the bus */ int (*read_c45)(struct mii_bus *bus, int addr, int devnum, int regnum); /** @write_c45: Perform a C45 write transfer on the bus */ int (*write_c45)(struct mii_bus *bus, int addr, int devnum, int regnum, u16 val); /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; /** * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; /** @parent: Parent device of this bus */ struct device *parent; /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; /** @dev: Kernel device representation */ struct device dev; /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; /** * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; /** @reset_gpiod: Reset GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) struct mii_bus *mdiobus_alloc_size(size_t size); /** * mdiobus_alloc - Allocate an MDIO bus structure * * The internal state of the MDIO bus will be set of MDIOBUS_ALLOCATED ready * for the driver to register the bus. */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); } int __mdiobus_register(struct mii_bus *bus, struct module *owner); int __devm_mdiobus_register(struct device *dev, struct mii_bus *bus, struct module *owner); #define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) #define devm_mdiobus_register(dev, bus) \ __devm_mdiobus_register(dev, bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) { return devm_mdiobus_alloc_size(dev, 0); } struct mii_bus *mdio_find_bus(const char *mdio_name); struct phy_device *mdiobus_scan_c22(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true /** * enum phy_state - PHY state machine states: * * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. * - PHY driver probe function will set the state to @PHY_READY * * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. * - timer moves to @PHY_NOLINK or @PHY_RUNNING * * @PHY_NOLINK: PHY is up, but not currently plugged in. * - irq or timer will set @PHY_RUNNING if link comes back * - phy_stop moves to @PHY_HALTED * * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets * - irq or timer will set @PHY_NOLINK if link goes down * - phy_stop moves to @PHY_HALTED * * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. * - phy_stop aborts the running test and moves to @PHY_HALTED * * @PHY_HALTED: PHY is up, but no polling or interrupts are done. * - phy_start moves to @PHY_UP * * @PHY_ERROR: PHY is up, but is in an error state. * - phy_stop moves to @PHY_HALTED */ enum phy_state { PHY_DOWN = 0, PHY_READY, PHY_HALTED, PHY_ERROR, PHY_UP, PHY_RUNNING, PHY_NOLINK, PHY_CABLETEST, }; #define MDIO_MMD_NUM 32 /** * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. * @device_ids: The device identifer for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; u32 mmds_present; u32 device_ids[MDIO_MMD_NUM]; }; struct macsec_context; struct macsec_ops; /** * struct phy_device - An instance of a PHY * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. * @is_internal: Set to true if this PHY is internal to a MAC. * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. * @is_gigabit_capable: Set to true if PHY supports 1000Mbps * @has_fixups: Set to true if this PHY has fixups/quirks. * @suspended: Set to true if this PHY has been suspended successfully. * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. * @is_on_sfp_module: Set true if PHY is located on an SFP module. * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @wol_enabled: Set to true if the PHY or the attached MAC have Wake-on-LAN * enabled. * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. * * - Bits [15:0] are free to use by the PHY driver to communicate * driver specific behavior. * - Bits [23:16] are currently reserved for future use. * - Bits [31:24] are reserved for defining generic * PHY driver behavior. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phylink: Pointer to phylink instance for this PHY * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached * @sfp_bus: SFP bus attached to this PHY's fiber port * @attached_dev: The attached enet driver's device instance ptr * @adjust_link: Callback for the enet controller to respond to changes: in the * link state. * @phy_link_change: Callback for phylink for notification of link change * @macsec_ops: MACsec offloading ops. * * @speed: Current link speed * @duplex: Current duplex * @port: Current port * @pause: Current pause * @asym_pause: Current asymmetric pause * @supported: Combined MAC/PHY supported linkmodes * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @supported_eee: supported PHY EEE linkmodes * @advertising_eee: Currently advertised EEE linkmodes * @eee_enabled: Flag indicating whether the EEE feature is enabled * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used * @rate_matching: Current rate matching mode * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled * @irq_suspended: Flag indicating PHY is suspended and therefore interrupt * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume * @interface: enum phy_interface_t value * @possible_interfaces: bitmap if interface modes that the attached PHY * will switch between depending on media speed. * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics * @phy_led_triggers: Array of LED triggers * @phy_num_led_triggers: Number of triggers in @phy_led_triggers * @led_link_trigger: LED trigger for link up/down * @last_triggered: last LED trigger for link speed * @leds: list of PHY LED structures * @master_slave_set: User requested master/slave configuration * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling * and disabling specific interrupts * * Contains some infrastructure for polling and interrupt * handling, as well as handling shifts in PHY hardware state */ struct phy_device { struct mdio_device mdio; /* Information about the PHY type */ /* And management functions */ struct phy_driver *drv; struct device_link *devlink; u32 phy_id; struct phy_c45_device_ids c45_ids; unsigned is_c45:1; unsigned is_internal:1; unsigned is_pseudo_fixed_link:1; unsigned is_gigabit_capable:1; unsigned has_fixups:1; unsigned suspended:1; unsigned suspended_by_mdio_bus:1; unsigned sysfs_links:1; unsigned loopback_enabled:1; unsigned downshifted_rate:1; unsigned is_on_sfp_module:1; unsigned mac_managed_pm:1; unsigned wol_enabled:1; unsigned autoneg:1; /* The most recently read link state */ unsigned link:1; unsigned autoneg_complete:1; /* Interrupts are enabled */ unsigned interrupts:1; unsigned irq_suspended:1; unsigned irq_rerun:1; int rate_matching; enum phy_state state; u32 dev_flags; phy_interface_t interface; DECLARE_PHY_INTERFACE_MASK(possible_interfaces); /* * forced speed & duplex (no autoneg) * partner speed & duplex & pause (autoneg) */ int speed; int duplex; int port; int pause; int asym_pause; u8 master_slave_get; u8 master_slave_set; u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); /* used for eee validation */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); bool eee_enabled; /* Host supported PHY interface types. Should be ignored if empty. */ DECLARE_PHY_INTERFACE_MASK(host_interfaces); /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; struct phy_led_trigger *led_link_trigger; #endif struct list_head leds; /* * Interrupt number for this PHY * -1 means no interrupt */ int irq; /* private data pointer */ /* For use by PHYs to maintain extra state */ void *priv; /* shared data pointer */ /* For use by PHYs inside the same package that need a shared state. */ struct phy_package_shared *shared; /* Reporting cable test results */ struct sk_buff *skb; void *ehdr; struct nlattr *nest; /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; struct mutex lock; /* This may be modified under the rtnl lock */ bool sfp_bus_attached; struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; struct pse_control *psec; u8 mdix; u8 mdix_ctrl; int pma_extable; unsigned int link_down_events; void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif }; /* Generic phy_device::dev_flags */ #define PHY_F_NO_IRQ 0x80000000 static inline struct phy_device *to_phy_device(const struct device *dev) { return container_of(to_mdio_device(dev), struct phy_device, mdio); } /** * struct phy_tdr_config - Configuration of a TDR raw test * * @first: Distance for first data collection point * @last: Distance for last data collection point * @step: Step between data collection points * @pair: Bitmap of cable pairs to collect data for * * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. * All distances are in centimeters. */ struct phy_tdr_config { u32 first; u32 last; u32 step; s8 pair; }; #define PHY_PAIR_ALL -1 /** * struct phy_plca_cfg - Configuration of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. * * @version: read-only PLCA register map version. -1 = not available. Ignored * when setting the configuration. Format is the same as reported by the PLCA * IDVER register (31.CA00). -1 = not available. * @enabled: PLCA configured mode (enabled/disabled). -1 = not available / don't * set. 0 = disabled, anything else = enabled. * @node_id: the PLCA local node identifier. -1 = not available / don't set. * Allowed values [0 .. 254]. 255 = node disabled. * @node_cnt: the PLCA node count (maximum number of nodes having a TO). Only * meaningful for the coordinator (node_id = 0). -1 = not available / don't * set. Allowed values [1 .. 255]. * @to_tmr: The value of the PLCA to_timer in bit-times, which determines the * PLCA transmit opportunity window opening. See IEEE802.3 Clause 148 for * more details. The to_timer shall be set equal over all nodes. * -1 = not available / don't set. Allowed values [0 .. 255]. * @burst_cnt: controls how many additional frames a node is allowed to send in * single transmit opportunity (TO). The default value of 0 means that the * node is allowed exactly one frame per TO. A value of 1 allows two frames * per TO, and so on. -1 = not available / don't set. * Allowed values [0 .. 255]. * @burst_tmr: controls how many bit times to wait for the MAC to send a new * frame before interrupting the burst. This value should be set to a value * greater than the MAC inter-packet gap (which is typically 96 bits). * -1 = not available / don't set. Allowed values [0 .. 255]. * * A structure containing configuration parameters for setting/getting the PLCA * RS configuration. The driver does not need to implement all the parameters, * but should report what is actually used. */ struct phy_plca_cfg { int version; int enabled; int node_id; int node_cnt; int to_tmr; int burst_cnt; int burst_tmr; }; /** * struct phy_plca_status - Status of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. * * @pst: The PLCA status as reported by the PST bit in the PLCA STATUS * register(31.CA03), indicating BEACON activity. * * A structure containing status information of the PLCA RS configuration. * The driver does not need to implement all the parameters, but should report * what is actually used. */ struct phy_plca_status { bool pst; }; /** * struct phy_led: An LED driven by the PHY * * @list: List of LEDs * @phydev: PHY this LED is attached to * @led_cdev: Standard LED class structure * @index: Number of the LED */ struct phy_led { struct list_head list; struct phy_device *phydev; struct led_classdev led_cdev; u8 index; }; #define to_phy_led(d) container_of(d, struct phy_led, led_cdev) /** * struct phy_driver - Driver structure for a particular PHY type * * @mdiodrv: Data common to all MDIO devices * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field * @name: The friendly name of this PHY type * @phy_id_mask: Defines the important bits of the phy_id * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. * Note that none of these functions should be called from * interrupt time. The goal is for the bus read/write functions * to be able to block when the bus transaction is happening, * and be freed up by an interrupt (The MPC85xx has this ability, * though it is not currently supported in the driver). */ struct phy_driver { struct mdio_driver_common mdiodrv; u32 phy_id; char *name; u32 phy_id_mask; const unsigned long * const features; u32 flags; const void *driver_data; /** * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); /** * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); /** * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); /** * @get_features: Probe the hardware to determine what * abilities it has. Should only set phydev->supported. */ int (*get_features)(struct phy_device *phydev); /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. This is used by phy consumers to determine * whether to advertise lower-speed modes for that interface. It is * assumed that if a rate matching mode is supported on an interface, * then that interface's rate can be adapted to all slower link speeds * supported by the phy. If the interface is not supported, this should * return %RATE_MATCH_NONE. */ int (*get_rate_matching)(struct phy_device *phydev, phy_interface_t iface); /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); /** * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); /** * @config_intr: Enables or disables interrupts. * It should also clear any pending interrupts prior to enabling the * IRQs and after disabling them. */ int (*config_intr)(struct phy_device *phydev); /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); /** * @match_phy_device: Returns true if this is a suitable * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY * register changes to enable Wake on LAN, so set_wol is * provided to be called in the ethernet driver's set_wol * function. */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @get_wol: See set_wol, but for checking whether Wake on LAN * is enabled. */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @link_change_notify: Called to inform a PHY device driver * when the core is about to change the link state. This * callback is supposed to be used as fixup hook for drivers * that need to take action when the link state * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); /** * @read_mmd: PHY specific driver override for reading a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD read function * will be used by phy_read_mmd(), which will use either a * direct read for Clause 45 PHYs or an indirect read for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); /** * @write_mmd: PHY specific driver override for writing a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD write function * will be used by phy_write_mmd(), which will use either a * direct write for Clause 45 PHYs, or an indirect write for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. val is the value to be written. */ int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); /** @read_page: Return the current PHY register page number */ int (*read_page)(struct phy_device *dev); /** @write_page: Set the current PHY register page number */ int (*write_page)(struct phy_device *dev, int page); /** * @module_info: Get the size and type of the eeprom contained * within a plug-in module */ int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo); /** * @module_eeprom: Get the eeprom information from the plug-in * module */ int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); /** @cable_test_start: Start a cable test */ int (*cable_test_start)(struct phy_device *dev); /** @cable_test_tdr_start: Start a raw TDR cable test */ int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config); /** * @cable_test_get_status: Once per second, or on interrupt, * request the status of the test. */ int (*cable_test_get_status)(struct phy_device *dev, bool *finished); /* Get statistics from the PHY using ethtool */ /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ void (*get_strings)(struct phy_device *dev, u8 *data); /** @get_stats: Return the statistic counter values */ void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); /* Get and Set PHY tunables */ /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data); /** @set_tunable: Set the value of a tunable */ int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); /** @set_loopback: Set the loopback mood of the PHY */ int (*set_loopback)(struct phy_device *dev, bool enable); /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); /* PLCA RS interface */ /** @get_plca_cfg: Return the current PLCA configuration */ int (*get_plca_cfg)(struct phy_device *dev, struct phy_plca_cfg *plca_cfg); /** @set_plca_cfg: Set the PLCA configuration */ int (*set_plca_cfg)(struct phy_device *dev, const struct phy_plca_cfg *plca_cfg); /** @get_plca_status: Return the current PLCA status info */ int (*get_plca_status)(struct phy_device *dev, struct phy_plca_status *plca_st); /** * @led_brightness_set: Set a PHY LED brightness. Index * indicates which of the PHYs led should be set. Value * follows the standard LED class meaning, e.g. LED_OFF, * LED_HALF, LED_FULL. */ int (*led_brightness_set)(struct phy_device *dev, u8 index, enum led_brightness value); /** * @led_blink_set: Set a PHY LED brightness. Index indicates * which of the PHYs led should be configured to blink. Delays * are in milliseconds and if both are zero then a sensible * default should be chosen. The call should adjust the * timings in that case and if it can't match the values * specified exactly. */ int (*led_blink_set)(struct phy_device *dev, u8 index, unsigned long *delay_on, unsigned long *delay_off); /** * @led_hw_is_supported: Can the HW support the given rules. * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules The core is interested in these rules * * Return 0 if yes, -EOPNOTSUPP if not, or an error code. */ int (*led_hw_is_supported)(struct phy_device *dev, u8 index, unsigned long rules); /** * @led_hw_control_set: Set the HW to control the LED * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules The rules used to control the LED * * Returns 0, or a an error code. */ int (*led_hw_control_set)(struct phy_device *dev, u8 index, unsigned long rules); /** * @led_hw_control_get: Get how the HW is controlling the LED * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules Pointer to the rules used to control the LED * * Set *@rules to how the HW is currently blinking. Returns 0 * on success, or a error code if the current blinking cannot * be represented in rules, or some other error happens. */ int (*led_hw_control_get)(struct phy_device *dev, u8 index, unsigned long *rules); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) #define PHY_ANY_ID "MATCH ANY PHY" #define PHY_ANY_UID 0xffffffff #define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0) #define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4) #define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10) /** * phy_id_compare - compare @id1 with @id2 taking account of @mask * @id1: first PHY ID * @id2: second PHY ID * @mask: the PHY ID mask, set bits are significant in matching * * Return true if the bits from @id1 and @id2 specified by @mask match. * This uses an equivalent test to (@id & @mask) == (@phy_id & @mask). */ static inline bool phy_id_compare(u32 id1, u32 id2, u32 mask) { return !((id1 ^ id2) & mask); } /** * phydev_id_compare - compare @id with the PHY's Clause 22 ID * @phydev: the PHY device * @id: the PHY ID to be matched * * Compare the @phydev clause 22 ID with the provided @id and return true or * false depending whether it matches, using the bound driver mask. The * @phydev must be bound to a driver. */ static inline bool phydev_id_compare(struct phy_device *phydev, u32 id) { return phy_id_compare(id, phydev->phy_id, phydev->drv->phy_id_mask); } /* A Structure for boards to register fixups with the PHY Lib */ struct phy_fixup { struct list_head list; char bus_id[MII_BUS_ID_SIZE + 3]; u32 phy_uid; u32 phy_uid_mask; int (*run)(struct phy_device *phydev); }; const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); const char *phy_rate_matching_to_str(int rate_matching); int phy_interface_num_ports(phy_interface_t interface); /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ struct phy_setting { u32 speed; u8 duplex; u8 bit; }; const struct phy_setting * phy_lookup_setting(int speed, int duplex, const unsigned long *mask, bool exact); size_t phy_speeds(unsigned int *speeds, size_t size, unsigned long *mask); void of_set_phy_supported(struct phy_device *phydev); void of_set_phy_eee_broken(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); /** * phy_is_started - Convenience function to check whether PHY is started * @phydev: The phy_device struct */ static inline bool phy_is_started(struct phy_device *phydev) { return phydev->state >= PHY_UP; } void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); void phy_check_downshift(struct phy_device *phydev); /** * phy_read - Convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_read(struct phy_device *phydev, u32 regnum) { return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \ timeout_us, sleep_before_read) \ ({ \ int __ret, __val; \ __ret = read_poll_timeout(__val = phy_read, val, \ __val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, phydev, regnum); \ if (__val < 0) \ __ret = __val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /** * __phy_read - convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * The caller must have taken the MDIO bus lock. */ static inline int __phy_read(struct phy_device *phydev, u32 regnum) { return __mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } /** * phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * The caller must have taken the MDIO bus lock. */ static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return __mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_modify_changed() - Convenience function for modifying a PHY register * @phydev: a pointer to a &struct phy_device * @regnum: register number * @mask: bit mask of bits to clear * @set: bit mask of bits to set * * Unlocked helper function which allows a PHY register to be modified as * new register value = (old register value & ~mask) | set * * Returns negative errno, 0 if there was no change, and 1 in case of change */ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set) { return __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr, regnum, mask, set); } /* * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /** * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a * condition is met or a timeout occurs * * @phydev: The phy_device struct * @devaddr: The MMD to read from * @regnum: The register on the MMD to read * @val: Variable to read the register into * @cond: Break condition (usually involving @val) * @sleep_us: Maximum time to sleep between reads in us (0 * tight-loops). Should be less than ~20ms since usleep_range * is used (see Documentation/timers/timers-howto.rst). * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. * Returns 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. */ #define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ sleep_us, timeout_us, sleep_before_read) \ ({ \ int __ret, __val; \ __ret = read_poll_timeout(__val = phy_read_mmd, val, \ __val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, \ phydev, devaddr, regnum); \ if (__val < 0) \ __ret = __val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /* * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /* * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); /* * __phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); /** * __phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, 0, val); } /** * __phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, val, 0); } /** * phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set */ static inline int phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, 0, val); } /** * phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear */ static inline int phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, val, 0); } /** * __phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * __phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set */ static inline int phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear */ static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_interrupt_is_valid - Convenience function for testing a given PHY irq * @phydev: the phy_device struct * * NOTE: must be kept in sync with addition/removal of PHY_POLL and * PHY_MAC_INTERRUPT */ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) { return phydev->irq != PHY_POLL && phydev->irq != PHY_MAC_INTERRUPT; } /** * phy_polling_mode - Convenience function for testing whether polling is * used to detect PHY status changes * @phydev: the phy_device struct */ static inline bool phy_polling_mode(struct phy_device *phydev) { if (phydev->state == PHY_CABLETEST) if (phydev->drv->flags & PHY_POLL_CABLE_TEST) return true; return phydev->irq == PHY_POLL; } /** * phy_has_hwtstamp - Tests whether a PHY time stamp configuration. * @phydev: the phy_device struct */ static inline bool phy_has_hwtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp; } /** * phy_has_rxtstamp - Tests whether a PHY supports receive time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_rxtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->rxtstamp; } /** * phy_has_tsinfo - Tests whether a PHY reports time stamping and/or * PTP hardware clock capabilities. * @phydev: the phy_device struct */ static inline bool phy_has_tsinfo(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->ts_info; } /** * phy_has_txtstamp - Tests whether a PHY supports transmit time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_txtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } static inline int phy_hwtstamp(struct phy_device *phydev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { return phydev->mii_ts->rxtstamp(phydev->mii_ts, skb, type); } static inline int phy_ts_info(struct phy_device *phydev, struct ethtool_ts_info *tsinfo) { return phydev->mii_ts->ts_info(phydev->mii_ts, tsinfo); } static inline void phy_txtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { phydev->mii_ts->txtstamp(phydev->mii_ts, skb, type); } /** * phy_is_internal - Convenience function for testing if a PHY is internal * @phydev: the phy_device struct */ static inline bool phy_is_internal(struct phy_device *phydev) { return phydev->is_internal; } /** * phy_on_sfp - Convenience function for testing if a PHY is on an SFP module * @phydev: the phy_device struct */ static inline bool phy_on_sfp(struct phy_device *phydev) { return phydev->is_on_sfp_module; } /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) * @mode: the &phy_interface_t enum */ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) { return mode >= PHY_INTERFACE_MODE_RGMII && mode <= PHY_INTERFACE_MODE_RGMII_TXID; }; /** * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z * negotiation * @mode: one of &enum phy_interface_t * * Returns true if the PHY interface mode uses the 16-bit negotiation * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding) */ static inline bool phy_interface_mode_is_8023z(phy_interface_t mode) { return mode == PHY_INTERFACE_MODE_1000BASEX || mode == PHY_INTERFACE_MODE_2500BASEX; } /** * phy_interface_is_rgmii - Convenience function for testing if a PHY interface * is RGMII (all variants) * @phydev: the phy_device struct */ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) { return phy_interface_mode_is_rgmii(phydev->interface); }; /** * phy_is_pseudo_fixed_link - Convenience function for testing if this * PHY is the CPU port facing side of an Ethernet switch, or similar. * @phydev: the phy_device struct */ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) { return phydev->is_pseudo_fixed_link; } int phy_save_page(struct phy_device *phydev); int phy_select_page(struct phy_device *phydev, int page); int phy_restore_page(struct phy_device *phydev, int oldpage, int ret); int phy_read_paged(struct phy_device *phydev, int page, u32 regnum); int phy_write_paged(struct phy_device *phydev, int page, u32 regnum, u16 val); int phy_modify_paged_changed(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); #if IS_ENABLED(CONFIG_PHYLIB) int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); #else static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id) { return 0; } static inline struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode) { return 0; } static inline struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) { return NULL; } static inline struct phy_device *device_phy_find_device(struct device *dev) { return NULL; } static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { return NULL; } static inline struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) { return NULL; } static inline int phy_device_register(struct phy_device *phy) { return 0; } static inline void phy_device_free(struct phy_device *phydev) { } #endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_get_c45_ids(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); struct phy_device *phy_find_first(struct mii_bus *bus); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, void (*handler)(struct net_device *), phy_interface_t interface); struct phy_device *phy_connect(struct net_device *dev, const char *bus_id, void (*handler)(struct net_device *), phy_interface_t interface); void phy_disconnect(struct phy_device *phydev); void phy_detach(struct phy_device *phydev); void phy_start(struct phy_device *phydev); void phy_stop(struct phy_device *phydev); int phy_config_aneg(struct phy_device *phydev); int _phy_start_aneg(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); int phy_restart_aneg(struct phy_device *phydev); int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } #endif static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); } #define phydev_err(_phydev, format, args...) \ dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_err_probe(_phydev, err, format, args...) \ dev_err_probe(&_phydev->mdio.dev, err, format, ##args) #define phydev_info(_phydev, format, args...) \ dev_info(&_phydev->mdio.dev, format, ##args) #define phydev_warn(_phydev, format, args...) \ dev_warn(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ dev_dbg(&_phydev->mdio.dev, format, ##args) static inline const char *phydev_name(const struct phy_device *phydev) { return dev_name(&phydev->mdio.dev); } static inline void phy_lock_mdio_bus(struct phy_device *phydev) { mutex_lock(&phydev->mdio.bus->mdio_lock); } static inline void phy_unlock_mdio_bus(struct phy_device *phydev) { mutex_unlock(&phydev->mdio.bus->mdio_lock); } void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) __printf(2, 3); char *phy_attached_info_irq(struct phy_device *phydev) __malloc; void phy_attached_info(struct phy_device *phydev); /* Clause 22 PHY */ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart); int genphy_config_eee_advert(struct phy_device *phydev); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status_fixed(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_read_master_slave(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); int genphy_loopback(struct phy_device *phydev, bool enable); int genphy_soft_reset(struct phy_device *phydev); irqreturn_t genphy_handle_interrupt_no_ack(struct phy_device *phydev); static inline int genphy_config_aneg(struct phy_device *phydev) { return __genphy_config_aneg(phydev, false); } static inline int genphy_no_config_intr(struct phy_device *phydev) { return 0; } int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum); int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, u16 regnum, u16 val); /* Clause 37 */ int genphy_c37_config_aneg(struct phy_device *phydev); int genphy_c37_read_status(struct phy_device *phydev); /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart); int genphy_c45_aneg_done(struct phy_device *phydev); int genphy_c45_read_link(struct phy_device *phydev); int genphy_c45_read_lpa(struct phy_device *phydev); int genphy_c45_read_pma(struct phy_device *phydev); int genphy_c45_pma_setup_forced(struct phy_device *phydev); int genphy_c45_pma_baset1_setup_master_slave(struct phy_device *phydev); int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev); int genphy_c45_read_eee_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_baset1_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); int genphy_c45_pma_resume(struct phy_device *phydev); int genphy_c45_pma_suspend(struct phy_device *phydev); int genphy_c45_fast_retrain(struct phy_device *phydev, bool enable); int genphy_c45_plca_get_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int genphy_c45_plca_set_cfg(struct phy_device *phydev, const struct phy_plca_cfg *plca_cfg); int genphy_c45_plca_get_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, unsigned long *lp, bool *is_enabled); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); static inline int phy_read_status(struct phy_device *phydev) { if (!phydev->drv) return -EIO; if (phydev->drv->read_status) return phydev->drv->read_status(phydev); else return genphy_read_status(phydev); } void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_error(struct phy_device *phydev); void phy_state_machine(struct work_struct *work); void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies); void phy_trigger_machine(struct phy_device *phydev); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd); int phy_ethtool_ksettings_set(struct phy_device *phydev, const struct ethtool_link_ksettings *cmd); int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); int phy_get_rate_matching(struct phy_device *phydev, phy_interface_t iface); void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); bool phy_validate_pause(struct phy_device *phydev, struct ethtool_pauseparam *pp); void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause); s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev, const int *delay_values, int size, bool is_rx); void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_register_fixup_for_id(const char *bus_id, int (*run)(struct phy_device *)); int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data); int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); int phy_ethtool_get_link_ksettings(struct net_device *ndev, struct ethtool_link_ksettings *cmd); int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int base_addr, size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data); int phy_ethtool_get_plca_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int phy_ethtool_set_plca_cfg(struct phy_device *phydev, const struct phy_plca_cfg *plca_cfg, struct netlink_ext_ack *extack); int phy_ethtool_get_plca_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int __phy_hwtstamp_get(struct phy_device *phydev, struct kernel_hwtstamp_config *config); int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); static inline int phy_package_address(struct phy_device *phydev, unsigned int addr_offset) { struct phy_package_shared *shared = phydev->shared; u8 base_addr = shared->base_addr; if (addr_offset >= PHY_MAX_ADDR - base_addr) return -EIO; /* we know that addr will be in the range 0..31 and thus the * implicit cast to a signed int is not a problem. */ return base_addr + addr_offset; } static inline int phy_package_read(struct phy_device *phydev, unsigned int addr_offset, u32 regnum) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int __phy_package_read(struct phy_device *phydev, unsigned int addr_offset, u32 regnum) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return __mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int phy_package_write(struct phy_device *phydev, unsigned int addr_offset, u32 regnum, u16 val) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline int __phy_package_write(struct phy_device *phydev, unsigned int addr_offset, u32 regnum, u16 val) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } int __phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); int phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); int __phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); int phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); static inline bool __phy_package_set_once(struct phy_device *phydev, unsigned int b) { struct phy_package_shared *shared = phydev->shared; if (!shared) return false; return !test_and_set_bit(b, &shared->flags); } static inline bool phy_package_init_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_INIT_DONE); } static inline bool phy_package_probe_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_PROBE_DONE); } extern struct bus_type mdio_bus_type; struct mdio_board_info { const char *bus_id; char modalias[MDIO_NAME_SIZE]; int mdio_addr; const void *platform_data; }; #if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); #else static inline int mdiobus_register_board_info(const struct mdio_board_info *i, unsigned int n) { return 0; } #endif /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register * @__count: Numbers of members in array * * Helper macro for PHY drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it * replaces module_init() and module_exit(). */ #define phy_module_driver(__phy_drivers, __count) \ static int __init phy_module_init(void) \ { \ return phy_drivers_register(__phy_drivers, __count, THIS_MODULE); \ } \ module_init(phy_module_init); \ static void __exit phy_module_exit(void) \ { \ phy_drivers_unregister(__phy_drivers, __count); \ } \ module_exit(phy_module_exit) #define module_phy_driver(__phy_drivers) \ phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers)) bool phy_driver_is_genphy(struct phy_device *phydev); bool phy_driver_is_genphy_10g(struct phy_device *phydev); #endif /* __PHY_H */
50 50 50 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 3 50 50 50 50 50 50 50 3 3 3 3 3 3 3 1 50 5 5 5 5 5 5 5 5 5 5 5 892 891 1115 674 38 485 203 203 29 202 9 9 9 3 3 31 31 31 31 22 22 3 22 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 // SPDX-License-Identifier: GPL-2.0-only /* * Helpers for formatting and printing strings * * Copyright 31 August 2008 James Bottomley * Copyright (C) 2013, Intel Corporation */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/math64.h> #include <linux/export.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_helpers.h> /** * string_get_size - get the size in the specified units * @size: The size to be converted in blocks * @blk_size: Size of the block (use 1 for size in bytes) * @units: units to use (powers of 1000 or 1024) * @buf: buffer to format to * @len: length of buffer * * This function returns a string formatted to 3 significant figures * giving the size in the required units. @buf should have room for * at least 9 bytes and will always be zero terminated. * * Return value: number of characters of output that would have been written * (which may be greater than len, if output was truncated). */ int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, char *buf, int len) { static const char *const units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" }; static const char *const units_2[] = { "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" }; static const char *const *const units_str[] = { [STRING_UNITS_10] = units_10, [STRING_UNITS_2] = units_2, }; static const unsigned int divisor[] = { [STRING_UNITS_10] = 1000, [STRING_UNITS_2] = 1024, }; static const unsigned int rounding[] = { 500, 50, 5 }; int i = 0, j; u32 remainder = 0, sf_cap; char tmp[8]; const char *unit; tmp[0] = '\0'; if (blk_size == 0) size = 0; if (size == 0) goto out; /* This is Napier's algorithm. Reduce the original block size to * * coefficient * divisor[units]^i * * we do the reduction so both coefficients are just under 32 bits so * that multiplying them together won't overflow 64 bits and we keep * as much precision as possible in the numbers. * * Note: it's safe to throw away the remainders here because all the * precision is in the coefficients. */ while (blk_size >> 32) { do_div(blk_size, divisor[units]); i++; } while (size >> 32) { do_div(size, divisor[units]); i++; } /* now perform the actual multiplication keeping i as the sum of the * two logarithms */ size *= blk_size; /* and logarithmically reduce it until it's just under the divisor */ while (size >= divisor[units]) { remainder = do_div(size, divisor[units]); i++; } /* work out in j how many digits of precision we need from the * remainder */ sf_cap = size; for (j = 0; sf_cap*10 < 1000; j++) sf_cap *= 10; if (units == STRING_UNITS_2) { /* express the remainder as a decimal. It's currently the * numerator of a fraction whose denominator is * divisor[units], which is 1 << 10 for STRING_UNITS_2 */ remainder *= 1000; remainder >>= 10; } /* add a 5 to the digit below what will be printed to ensure * an arithmetical round up and carry it through to size */ remainder += rounding[j]; if (remainder >= 1000) { remainder -= 1000; size += 1; } if (j) { snprintf(tmp, sizeof(tmp), ".%03u", remainder); tmp[j+1] = '\0'; } out: if (i >= ARRAY_SIZE(units_2)) unit = "UNK"; else unit = units_str[units][i]; return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit); } EXPORT_SYMBOL(string_get_size); /** * parse_int_array_user - Split string into a sequence of integers * @from: The user space buffer to read from * @count: The maximum number of bytes to read * @array: Returned pointer to sequence of integers * * On success @array is allocated and initialized with a sequence of * integers extracted from the @from plus an additional element that * begins the sequence and specifies the integers count. * * Caller takes responsibility for freeing @array when it is no longer * needed. */ int parse_int_array_user(const char __user *from, size_t count, int **array) { int *ints, nints; char *buf; int ret = 0; buf = memdup_user_nul(from, count); if (IS_ERR(buf)) return PTR_ERR(buf); get_options(buf, 0, &nints); if (!nints) { ret = -ENOENT; goto free_buf; } ints = kcalloc(nints + 1, sizeof(*ints), GFP_KERNEL); if (!ints) { ret = -ENOMEM; goto free_buf; } get_options(buf, nints + 1, ints); *array = ints; free_buf: kfree(buf); return ret; } EXPORT_SYMBOL(parse_int_array_user); static bool unescape_space(char **src, char **dst) { char *p = *dst, *q = *src; switch (*q) { case 'n': *p = '\n'; break; case 'r': *p = '\r'; break; case 't': *p = '\t'; break; case 'v': *p = '\v'; break; case 'f': *p = '\f'; break; default: return false; } *dst += 1; *src += 1; return true; } static bool unescape_octal(char **src, char **dst) { char *p = *dst, *q = *src; u8 num; if (isodigit(*q) == 0) return false; num = (*q++) & 7; while (num < 32 && isodigit(*q) && (q - *src < 3)) { num <<= 3; num += (*q++) & 7; } *p = num; *dst += 1; *src = q; return true; } static bool unescape_hex(char **src, char **dst) { char *p = *dst, *q = *src; int digit; u8 num; if (*q++ != 'x') return false; num = digit = hex_to_bin(*q++); if (digit < 0) return false; digit = hex_to_bin(*q); if (digit >= 0) { q++; num = (num << 4) | digit; } *p = num; *dst += 1; *src = q; return true; } static bool unescape_special(char **src, char **dst) { char *p = *dst, *q = *src; switch (*q) { case '\"': *p = '\"'; break; case '\\': *p = '\\'; break; case 'a': *p = '\a'; break; case 'e': *p = '\e'; break; default: return false; } *dst += 1; *src += 1; return true; } /** * string_unescape - unquote characters in the given string * @src: source buffer (escaped) * @dst: destination buffer (unescaped) * @size: size of the destination buffer (0 to unlimit) * @flags: combination of the flags. * * Description: * The function unquotes characters in the given string. * * Because the size of the output will be the same as or less than the size of * the input, the transformation may be performed in place. * * Caller must provide valid source and destination pointers. Be aware that * destination buffer will always be NULL-terminated. Source string must be * NULL-terminated as well. The supported flags are:: * * UNESCAPE_SPACE: * '\f' - form feed * '\n' - new line * '\r' - carriage return * '\t' - horizontal tab * '\v' - vertical tab * UNESCAPE_OCTAL: * '\NNN' - byte with octal value NNN (1 to 3 digits) * UNESCAPE_HEX: * '\xHH' - byte with hexadecimal value HH (1 to 2 digits) * UNESCAPE_SPECIAL: * '\"' - double quote * '\\' - backslash * '\a' - alert (BEL) * '\e' - escape * UNESCAPE_ANY: * all previous together * * Return: * The amount of the characters processed to the destination buffer excluding * trailing '\0' is returned. */ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) { char *out = dst; while (*src && --size) { if (src[0] == '\\' && src[1] != '\0' && size > 1) { src++; size--; if (flags & UNESCAPE_SPACE && unescape_space(&src, &out)) continue; if (flags & UNESCAPE_OCTAL && unescape_octal(&src, &out)) continue; if (flags & UNESCAPE_HEX && unescape_hex(&src, &out)) continue; if (flags & UNESCAPE_SPECIAL && unescape_special(&src, &out)) continue; *out++ = '\\'; } *out++ = *src++; } *out = '\0'; return out - dst; } EXPORT_SYMBOL(string_unescape); static bool escape_passthrough(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = c; *dst = out + 1; return true; } static bool escape_space(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; switch (c) { case '\n': to = 'n'; break; case '\r': to = 'r'; break; case '\t': to = 't'; break; case '\v': to = 'v'; break; case '\f': to = 'f'; break; default: return false; } if (out < end) *out = '\\'; ++out; if (out < end) *out = to; ++out; *dst = out; return true; } static bool escape_special(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; switch (c) { case '\\': to = '\\'; break; case '\a': to = 'a'; break; case '\e': to = 'e'; break; case '"': to = '"'; break; default: return false; } if (out < end) *out = '\\'; ++out; if (out < end) *out = to; ++out; *dst = out; return true; } static bool escape_null(unsigned char c, char **dst, char *end) { char *out = *dst; if (c) return false; if (out < end) *out = '\\'; ++out; if (out < end) *out = '0'; ++out; *dst = out; return true; } static bool escape_octal(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = '\\'; ++out; if (out < end) *out = ((c >> 6) & 0x07) + '0'; ++out; if (out < end) *out = ((c >> 3) & 0x07) + '0'; ++out; if (out < end) *out = ((c >> 0) & 0x07) + '0'; ++out; *dst = out; return true; } static bool escape_hex(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = '\\'; ++out; if (out < end) *out = 'x'; ++out; if (out < end) *out = hex_asc_hi(c); ++out; if (out < end) *out = hex_asc_lo(c); ++out; *dst = out; return true; } /** * string_escape_mem - quote characters in the given memory buffer * @src: source buffer (unescaped) * @isz: source buffer size * @dst: destination buffer (escaped) * @osz: destination buffer size * @flags: combination of the flags * @only: NULL-terminated string containing characters used to limit * the selected escape class. If characters are included in @only * that would not normally be escaped by the classes selected * in @flags, they will be copied to @dst unescaped. * * Description: * The process of escaping byte buffer includes several parts. They are applied * in the following sequence. * * 1. The character is not matched to the one from @only string and thus * must go as-is to the output. * 2. The character is matched to the printable and ASCII classes, if asked, * and in case of match it passes through to the output. * 3. The character is matched to the printable or ASCII class, if asked, * and in case of match it passes through to the output. * 4. The character is checked if it falls into the class given by @flags. * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any * character. Note that they actually can't go together, otherwise * %ESCAPE_HEX will be ignored. * * Caller must provide valid source and destination pointers. Be aware that * destination buffer will not be NULL-terminated, thus caller have to append * it if needs. The supported flags are:: * * %ESCAPE_SPACE: (special white space, not space itself) * '\f' - form feed * '\n' - new line * '\r' - carriage return * '\t' - horizontal tab * '\v' - vertical tab * %ESCAPE_SPECIAL: * '\"' - double quote * '\\' - backslash * '\a' - alert (BEL) * '\e' - escape * %ESCAPE_NULL: * '\0' - null * %ESCAPE_OCTAL: * '\NNN' - byte with octal value NNN (3 digits) * %ESCAPE_ANY: * all previous together * %ESCAPE_NP: * escape only non-printable characters, checked by isprint() * %ESCAPE_ANY_NP: * all previous together * %ESCAPE_HEX: * '\xHH' - byte with hexadecimal value HH (2 digits) * %ESCAPE_NA: * escape only non-ascii characters, checked by isascii() * %ESCAPE_NAP: * escape only non-printable or non-ascii characters * %ESCAPE_APPEND: * append characters from @only to be escaped by the given classes * * %ESCAPE_APPEND would help to pass additional characters to the escaped, when * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided. * * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the * higher priority than the rest of the flags (%ESCAPE_NAP is the highest). * It doesn't make much sense to use either of them without %ESCAPE_OCTAL * or %ESCAPE_HEX, because they cover most of the other character classes. * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to * the above. * * Return: * The total size of the escaped output that would be generated for * the given input and flags. To check whether the output was * truncated, compare the return value to osz. There is room left in * dst for a '\0' terminator if and only if ret < osz. */ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *only) { char *p = dst; char *end = p + osz; bool is_dict = only && *only; bool is_append = flags & ESCAPE_APPEND; while (isz--) { unsigned char c = *src++; bool in_dict = is_dict && strchr(only, c); /* * Apply rules in the following sequence: * - the @only string is supplied and does not contain a * character under question * - the character is printable and ASCII, when @flags has * %ESCAPE_NAP bit set * - the character is printable, when @flags has * %ESCAPE_NP bit set * - the character is ASCII, when @flags has * %ESCAPE_NA bit set * - the character doesn't fall into a class of symbols * defined by given @flags * In these cases we just pass through a character to the * output buffer. * * When %ESCAPE_APPEND is passed, the characters from @only * have been excluded from the %ESCAPE_NAP, %ESCAPE_NP, and * %ESCAPE_NA cases. */ if (!(is_append || in_dict) && is_dict && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isascii(c) && isprint(c) && flags & ESCAPE_NAP && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isprint(c) && flags & ESCAPE_NP && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isascii(c) && flags & ESCAPE_NA && escape_passthrough(c, &p, end)) continue; if (flags & ESCAPE_SPACE && escape_space(c, &p, end)) continue; if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end)) continue; if (flags & ESCAPE_NULL && escape_null(c, &p, end)) continue; /* ESCAPE_OCTAL and ESCAPE_HEX always go last */ if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end)) continue; if (flags & ESCAPE_HEX && escape_hex(c, &p, end)) continue; escape_passthrough(c, &p, end); } return p - dst; } EXPORT_SYMBOL(string_escape_mem); /* * Return an allocated string that has been escaped of special characters * and double quotes, making it safe to log in quotes. */ char *kstrdup_quotable(const char *src, gfp_t gfp) { size_t slen, dlen; char *dst; const int flags = ESCAPE_HEX; const char esc[] = "\f\n\r\t\v\a\e\\\""; if (!src) return NULL; slen = strlen(src); dlen = string_escape_mem(src, slen, NULL, 0, flags, esc); dst = kmalloc(dlen + 1, gfp); if (!dst) return NULL; WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen); dst[dlen] = '\0'; return dst; } EXPORT_SYMBOL_GPL(kstrdup_quotable); /* * Returns allocated NULL-terminated string containing process * command line, with inter-argument NULLs replaced with spaces, * and other special characters escaped. */ char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) { char *buffer, *quoted; int i, res; buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buffer) return NULL; res = get_cmdline(task, buffer, PAGE_SIZE - 1); buffer[res] = '\0'; /* Collapse trailing NULLs, leave res pointing to last non-NULL. */ while (--res >= 0 && buffer[res] == '\0') ; /* Replace inter-argument NULLs. */ for (i = 0; i <= res; i++) if (buffer[i] == '\0') buffer[i] = ' '; /* Make sure result is printable. */ quoted = kstrdup_quotable(buffer, gfp); kfree(buffer); return quoted; } EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline); /* * Returns allocated NULL-terminated string containing pathname, * with special characters escaped, able to be safely logged. If * there is an error, the leading character will be "<". */ char *kstrdup_quotable_file(struct file *file, gfp_t gfp) { char *temp, *pathname; if (!file) return kstrdup("<unknown>", gfp); /* We add 11 spaces for ' (deleted)' to be appended */ temp = kmalloc(PATH_MAX + 11, GFP_KERNEL); if (!temp) return kstrdup("<no_memory>", gfp); pathname = file_path(file, temp, PATH_MAX + 11); if (IS_ERR(pathname)) pathname = kstrdup("<too_long>", gfp); else pathname = kstrdup_quotable(pathname, gfp); kfree(temp); return pathname; } EXPORT_SYMBOL_GPL(kstrdup_quotable_file); /* * Returns duplicate string in which the @old characters are replaced by @new. */ char *kstrdup_and_replace(const char *src, char old, char new, gfp_t gfp) { char *dst; dst = kstrdup(src, gfp); if (!dst) return NULL; return strreplace(dst, old, new); } EXPORT_SYMBOL_GPL(kstrdup_and_replace); /** * kasprintf_strarray - allocate and fill array of sequential strings * @gfp: flags for the slab allocator * @prefix: prefix to be used * @n: amount of lines to be allocated and filled * * Allocates and fills @n strings using pattern "%s-%zu", where prefix * is provided by caller. The caller is responsible to free them with * kfree_strarray() after use. * * Returns array of strings or NULL when memory can't be allocated. */ char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n) { char **names; size_t i; names = kcalloc(n + 1, sizeof(char *), gfp); if (!names) return NULL; for (i = 0; i < n; i++) { names[i] = kasprintf(gfp, "%s-%zu", prefix, i); if (!names[i]) { kfree_strarray(names, i); return NULL; } } return names; } EXPORT_SYMBOL_GPL(kasprintf_strarray); /** * kfree_strarray - free a number of dynamically allocated strings contained * in an array and the array itself * * @array: Dynamically allocated array of strings to free. * @n: Number of strings (starting from the beginning of the array) to free. * * Passing a non-NULL @array and @n == 0 as well as NULL @array are valid * use-cases. If @array is NULL, the function does nothing. */ void kfree_strarray(char **array, size_t n) { unsigned int i; if (!array) return; for (i = 0; i < n; i++) kfree(array[i]); kfree(array); } EXPORT_SYMBOL_GPL(kfree_strarray); struct strarray { char **array; size_t n; }; static void devm_kfree_strarray(struct device *dev, void *res) { struct strarray *array = res; kfree_strarray(array->array, array->n); } char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n) { struct strarray *ptr; ptr = devres_alloc(devm_kfree_strarray, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); ptr->array = kasprintf_strarray(GFP_KERNEL, prefix, n); if (!ptr->array) { devres_free(ptr); return ERR_PTR(-ENOMEM); } ptr->n = n; devres_add(dev, ptr); return ptr->array; } EXPORT_SYMBOL_GPL(devm_kasprintf_strarray); /** * strscpy_pad() - Copy a C-string into a sized buffer * @dest: Where to copy the string to * @src: Where to copy the string from * @count: Size of destination buffer * * Copy the string, or as much of it as fits, into the dest buffer. The * behavior is undefined if the string buffers overlap. The destination * buffer is always %NUL terminated, unless it's zero-sized. * * If the source string is shorter than the destination buffer, zeros * the tail of the destination buffer. * * For full explanation of why you may want to consider using the * 'strscpy' functions please see the function docstring for strscpy(). * * Returns: * * The number of characters copied (not including the trailing %NUL) * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy_pad(char *dest, const char *src, size_t count) { ssize_t written; written = strscpy(dest, src, count); if (written < 0 || written == count - 1) return written; memset(dest + written + 1, 0, count - written - 1); return written; } EXPORT_SYMBOL(strscpy_pad); /** * skip_spaces - Removes leading whitespace from @str. * @str: The string to be stripped. * * Returns a pointer to the first non-whitespace character in @str. */ char *skip_spaces(const char *str) { while (isspace(*str)) ++str; return (char *)str; } EXPORT_SYMBOL(skip_spaces); /** * strim - Removes leading and trailing whitespace from @s. * @s: The string to be stripped. * * Note that the first trailing whitespace is replaced with a %NUL-terminator * in the given string @s. Returns a pointer to the first non-whitespace * character in @s. */ char *strim(char *s) { size_t size; char *end; size = strlen(s); if (!size) return s; end = s + size - 1; while (end >= s && isspace(*end)) end--; *(end + 1) = '\0'; return skip_spaces(s); } EXPORT_SYMBOL(strim); /** * sysfs_streq - return true if strings are equal, modulo trailing newline * @s1: one string * @s2: another string * * This routine returns true iff two strings are equal, treating both * NUL and newline-then-NUL as equivalent string terminations. It's * geared for use with sysfs input strings, which generally terminate * with newlines but are compared against values without newlines. */ bool sysfs_streq(const char *s1, const char *s2) { while (*s1 && *s1 == *s2) { s1++; s2++; } if (*s1 == *s2) return true; if (!*s1 && *s2 == '\n' && !s2[1]) return true; if (*s1 == '\n' && !s1[1] && !*s2) return true; return false; } EXPORT_SYMBOL(sysfs_streq); /** * match_string - matches given string in an array * @array: array of strings * @n: number of strings in the array or -1 for NULL terminated arrays * @string: string to match with * * This routine will look for a string in an array of strings up to the * n-th element in the array or until the first NULL element. * * Historically the value of -1 for @n, was used to search in arrays that * are NULL terminated. However, the function does not make a distinction * when finishing the search: either @n elements have been compared OR * the first NULL element was found. * * Return: * index of a @string in the @array if matches, or %-EINVAL otherwise. */ int match_string(const char * const *array, size_t n, const char *string) { int index; const char *item; for (index = 0; index < n; index++) { item = array[index]; if (!item) break; if (!strcmp(item, string)) return index; } return -EINVAL; } EXPORT_SYMBOL(match_string); /** * __sysfs_match_string - matches given string in an array * @array: array of strings * @n: number of strings in the array or -1 for NULL terminated arrays * @str: string to match with * * Returns index of @str in the @array or -EINVAL, just like match_string(). * Uses sysfs_streq instead of strcmp for matching. * * This routine will look for a string in an array of strings up to the * n-th element in the array or until the first NULL element. * * Historically the value of -1 for @n, was used to search in arrays that * are NULL terminated. However, the function does not make a distinction * when finishing the search: either @n elements have been compared OR * the first NULL element was found. */ int __sysfs_match_string(const char * const *array, size_t n, const char *str) { const char *item; int index; for (index = 0; index < n; index++) { item = array[index]; if (!item) break; if (sysfs_streq(item, str)) return index; } return -EINVAL; } EXPORT_SYMBOL(__sysfs_match_string); /** * strreplace - Replace all occurrences of character in string. * @str: The string to operate on. * @old: The character being replaced. * @new: The character @old is replaced with. * * Replaces the each @old character with a @new one in the given string @str. * * Return: pointer to the string @str itself. */ char *strreplace(char *str, char old, char new) { char *s = str; for (; *s; ++s) if (*s == old) *s = new; return str; } EXPORT_SYMBOL(strreplace); /** * memcpy_and_pad - Copy one buffer to another with padding * @dest: Where to copy to * @dest_len: The destination buffer size * @src: Where to copy from * @count: The number of bytes to copy * @pad: Character to use for padding if space is left in destination. */ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad) { if (dest_len > count) { memcpy(dest, src, count); memset(dest + count, pad, dest_len - count); } else { memcpy(dest, src, dest_len); } } EXPORT_SYMBOL(memcpy_and_pad); #ifdef CONFIG_FORTIFY_SOURCE /* These are placeholders for fortify compile-time warnings. */ void __read_overflow2_field(size_t avail, size_t wanted) { } EXPORT_SYMBOL(__read_overflow2_field); void __write_overflow_field(size_t avail, size_t wanted) { } EXPORT_SYMBOL(__write_overflow_field); void fortify_panic(const char *name) { pr_emerg("detected buffer overflow in %s\n", name); BUG(); } EXPORT_SYMBOL(fortify_panic); #endif /* CONFIG_FORTIFY_SOURCE */
96 17 4 10 14 10 9 1 1 1 37 37 37 37 10 32 10 24 24 24 24 33 32 33 36 37 1 1 1 1 34 34 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 33 33 35 15 15 15 37 37 37 37 37 37 37 37 37 37 35 36 37 37 37 33 33 33 33 33 33 36 37 37 37 37 37 37 37 37 36 37 37 37 37 29 7 7 37 37 1 37 10 10 10 37 17 17 4 4 4 17 17 41 4 48 48 48 48 48 1 1 1 1 1 1 1 48 48 48 48 48 48 48 69 69 69 44 37 37 59 59 59 44 44 19 27 44 44 44 59 59 59 59 59 59 59 18 18 18 18 18 18 18 18 18 19 2 2 2 2 14 14 14 3 3 22 22 26 2 2 2 2 1 1 1 1 1 1 1 1 1 22 23 22 23 23 23 23 23 1 23 23 23 21 23 21 23 1 21 2 21 22 22 22 22 21 22 22 1 22 22 22 22 22 22 22 22 22 22 1 22 2 22 22 22 22 1 1 1 1 1 1 1 1 1 9 9 9 3 2 1 3 11 10 9 8 5 5 5 2 2 2 2 2 2 2 2 2 11 10 10 9 10 1 1 1 4 2 2 2 2 35 35 1 34 1 34 33 34 2 33 2 2 31 1 27 3 1 30 1 2 1 1 1 27 4 27 8 27 4 23 5 13 2 2 1 1 1 1 2 2 2 2 2 1 1 2 1 2 641 640 641 641 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211_hwsim - software simulator of 802.11 radio(s) for mac80211 * Copyright (c) 2008, Jouni Malinen <j@w1.fi> * Copyright (c) 2011, Javier Lopez <jlopex@gmail.com> * Copyright (c) 2016 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2023 Intel Corporation */ /* * TODO: * - Add TSF sync and fix IBSS beacon transmission by adding * competition for "air time" at TBTT * - RX filtering based on filter configuration (data->rx_filter) */ #include <linux/list.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/mac80211.h> #include <net/ieee80211_radiotap.h> #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/platform_device.h> #include <linux/debugfs.h> #include <linux/module.h> #include <linux/ktime.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <linux/nospec.h> #include <linux/virtio.h> #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include "mac80211_hwsim.h" #define WARN_QUEUE 100 #define MAX_QUEUE 200 MODULE_AUTHOR("Jouni Malinen"); MODULE_DESCRIPTION("Software simulator of 802.11 radio(s) for mac80211"); MODULE_LICENSE("GPL"); static int radios = 2; module_param(radios, int, 0444); MODULE_PARM_DESC(radios, "Number of simulated radios"); static int channels = 1; module_param(channels, int, 0444); MODULE_PARM_DESC(channels, "Number of concurrent channels"); static bool paged_rx = false; module_param(paged_rx, bool, 0644); MODULE_PARM_DESC(paged_rx, "Use paged SKBs for RX instead of linear ones"); static bool rctbl = false; module_param(rctbl, bool, 0444); MODULE_PARM_DESC(rctbl, "Handle rate control table"); static bool support_p2p_device = true; module_param(support_p2p_device, bool, 0444); MODULE_PARM_DESC(support_p2p_device, "Support P2P-Device interface type"); static bool mlo; module_param(mlo, bool, 0444); MODULE_PARM_DESC(mlo, "Support MLO"); /** * enum hwsim_regtest - the type of regulatory tests we offer * * @HWSIM_REGTEST_DISABLED: No regulatory tests are performed, * this is the default value. * @HWSIM_REGTEST_DRIVER_REG_FOLLOW: Used for testing the driver regulatory * hint, only one driver regulatory hint will be sent as such the * secondary radios are expected to follow. * @HWSIM_REGTEST_DRIVER_REG_ALL: Used for testing the driver regulatory * request with all radios reporting the same regulatory domain. * @HWSIM_REGTEST_DIFF_COUNTRY: Used for testing the drivers calling * different regulatory domains requests. Expected behaviour is for * an intersection to occur but each device will still use their * respective regulatory requested domains. Subsequent radios will * use the resulting intersection. * @HWSIM_REGTEST_WORLD_ROAM: Used for testing the world roaming. We accomplish * this by using a custom beacon-capable regulatory domain for the first * radio. All other device world roam. * @HWSIM_REGTEST_CUSTOM_WORLD: Used for testing the custom world regulatory * domain requests. All radios will adhere to this custom world regulatory * domain. * @HWSIM_REGTEST_CUSTOM_WORLD_2: Used for testing 2 custom world regulatory * domain requests. The first radio will adhere to the first custom world * regulatory domain, the second one to the second custom world regulatory * domain. All other devices will world roam. * @HWSIM_REGTEST_STRICT_FOLLOW: Used for testing strict regulatory domain * settings, only the first radio will send a regulatory domain request * and use strict settings. The rest of the radios are expected to follow. * @HWSIM_REGTEST_STRICT_ALL: Used for testing strict regulatory domain * settings. All radios will adhere to this. * @HWSIM_REGTEST_STRICT_AND_DRIVER_REG: Used for testing strict regulatory * domain settings, combined with secondary driver regulatory domain * settings. The first radio will get a strict regulatory domain setting * using the first driver regulatory request and the second radio will use * non-strict settings using the second driver regulatory request. All * other devices should follow the intersection created between the * first two. * @HWSIM_REGTEST_ALL: Used for testing every possible mix. You will need * at least 6 radios for a complete test. We will test in this order: * 1 - driver custom world regulatory domain * 2 - second custom world regulatory domain * 3 - first driver regulatory domain request * 4 - second driver regulatory domain request * 5 - strict regulatory domain settings using the third driver regulatory * domain request * 6 and on - should follow the intersection of the 3rd, 4rth and 5th radio * regulatory requests. * * These are the different values you can use for the regtest * module parameter. This is useful to help test world roaming * and the driver regulatory_hint() call and combinations of these. * If you want to do specific alpha2 regulatory domain tests simply * use the userspace regulatory request as that will be respected as * well without the need of this module parameter. This is designed * only for testing the driver regulatory request, world roaming * and all possible combinations. */ enum hwsim_regtest { HWSIM_REGTEST_DISABLED = 0, HWSIM_REGTEST_DRIVER_REG_FOLLOW = 1, HWSIM_REGTEST_DRIVER_REG_ALL = 2, HWSIM_REGTEST_DIFF_COUNTRY = 3, HWSIM_REGTEST_WORLD_ROAM = 4, HWSIM_REGTEST_CUSTOM_WORLD = 5, HWSIM_REGTEST_CUSTOM_WORLD_2 = 6, HWSIM_REGTEST_STRICT_FOLLOW = 7, HWSIM_REGTEST_STRICT_ALL = 8, HWSIM_REGTEST_STRICT_AND_DRIVER_REG = 9, HWSIM_REGTEST_ALL = 10, }; /* Set to one of the HWSIM_REGTEST_* values above */ static int regtest = HWSIM_REGTEST_DISABLED; module_param(regtest, int, 0444); MODULE_PARM_DESC(regtest, "The type of regulatory test we want to run"); static const char *hwsim_alpha2s[] = { "FI", "AL", "US", "DE", "JP", "AL", }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_01 = { .n_reg_rules = 5, .alpha2 = "99", .reg_rules = { REG_RULE(2412-10, 2462+10, 40, 0, 20, 0), REG_RULE(2484-10, 2484+10, 40, 0, 20, 0), REG_RULE(5150-10, 5240+10, 40, 0, 30, 0), REG_RULE(5745-10, 5825+10, 40, 0, 30, 0), REG_RULE(5855-10, 5925+10, 40, 0, 33, 0), } }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_02 = { .n_reg_rules = 3, .alpha2 = "99", .reg_rules = { REG_RULE(2412-10, 2462+10, 40, 0, 20, 0), REG_RULE(5725-10, 5850+10, 40, 0, 30, NL80211_RRF_NO_IR), REG_RULE(5855-10, 5925+10, 40, 0, 33, 0), } }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_03 = { .n_reg_rules = 6, .alpha2 = "99", .reg_rules = { REG_RULE(2412 - 10, 2462 + 10, 40, 0, 20, 0), REG_RULE(2484 - 10, 2484 + 10, 40, 0, 20, 0), REG_RULE(5150 - 10, 5240 + 10, 40, 0, 30, 0), REG_RULE(5745 - 10, 5825 + 10, 40, 0, 30, 0), REG_RULE(5855 - 10, 5925 + 10, 40, 0, 33, 0), REG_RULE(5955 - 10, 7125 + 10, 320, 0, 33, 0), } }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_04 = { .n_reg_rules = 6, .alpha2 = "99", .reg_rules = { REG_RULE(2412 - 10, 2462 + 10, 40, 0, 20, 0), REG_RULE(2484 - 10, 2484 + 10, 40, 0, 20, 0), REG_RULE(5150 - 10, 5240 + 10, 80, 0, 30, 0), REG_RULE(5260 - 10, 5320 + 10, 80, 0, 30, NL80211_RRF_DFS_CONCURRENT | NL80211_RRF_DFS), REG_RULE(5745 - 10, 5825 + 10, 80, 0, 30, 0), REG_RULE(5855 - 10, 5925 + 10, 80, 0, 33, 0), } }; static const struct ieee80211_regdomain *hwsim_world_regdom_custom[] = { &hwsim_world_regdom_custom_01, &hwsim_world_regdom_custom_02, &hwsim_world_regdom_custom_03, &hwsim_world_regdom_custom_04, }; struct hwsim_vif_priv { u32 magic; u8 bssid[ETH_ALEN]; bool assoc; bool bcn_en; u16 aid; }; #define HWSIM_VIF_MAGIC 0x69537748 static inline void hwsim_check_magic(struct ieee80211_vif *vif) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; WARN(vp->magic != HWSIM_VIF_MAGIC, "Invalid VIF (%p) magic %#x, %pM, %d/%d\n", vif, vp->magic, vif->addr, vif->type, vif->p2p); } static inline void hwsim_set_magic(struct ieee80211_vif *vif) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; vp->magic = HWSIM_VIF_MAGIC; } static inline void hwsim_clear_magic(struct ieee80211_vif *vif) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; vp->magic = 0; } struct hwsim_sta_priv { u32 magic; unsigned int last_link; u16 active_links_rx; }; #define HWSIM_STA_MAGIC 0x6d537749 static inline void hwsim_check_sta_magic(struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; WARN_ON(sp->magic != HWSIM_STA_MAGIC); } static inline void hwsim_set_sta_magic(struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; sp->magic = HWSIM_STA_MAGIC; } static inline void hwsim_clear_sta_magic(struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; sp->magic = 0; } struct hwsim_chanctx_priv { u32 magic; }; #define HWSIM_CHANCTX_MAGIC 0x6d53774a static inline void hwsim_check_chanctx_magic(struct ieee80211_chanctx_conf *c) { struct hwsim_chanctx_priv *cp = (void *)c->drv_priv; WARN_ON(cp->magic != HWSIM_CHANCTX_MAGIC); } static inline void hwsim_set_chanctx_magic(struct ieee80211_chanctx_conf *c) { struct hwsim_chanctx_priv *cp = (void *)c->drv_priv; cp->magic = HWSIM_CHANCTX_MAGIC; } static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c) { struct hwsim_chanctx_priv *cp = (void *)c->drv_priv; cp->magic = 0; } static unsigned int hwsim_net_id; static DEFINE_IDA(hwsim_netgroup_ida); struct hwsim_net { int netgroup; u32 wmediumd; }; static inline int hwsim_net_get_netgroup(struct net *net) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); return hwsim_net->netgroup; } static inline int hwsim_net_set_netgroup(struct net *net) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); hwsim_net->netgroup = ida_alloc(&hwsim_netgroup_ida, GFP_KERNEL); return hwsim_net->netgroup >= 0 ? 0 : -ENOMEM; } static inline u32 hwsim_net_get_wmediumd(struct net *net) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); return hwsim_net->wmediumd; } static inline void hwsim_net_set_wmediumd(struct net *net, u32 portid) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); hwsim_net->wmediumd = portid; } static struct class *hwsim_class; static struct net_device *hwsim_mon; /* global monitor netdev */ #define CHAN2G(_freq) { \ .band = NL80211_BAND_2GHZ, \ .center_freq = (_freq), \ .hw_value = (_freq), \ } #define CHAN5G(_freq) { \ .band = NL80211_BAND_5GHZ, \ .center_freq = (_freq), \ .hw_value = (_freq), \ } #define CHAN6G(_freq) { \ .band = NL80211_BAND_6GHZ, \ .center_freq = (_freq), \ .hw_value = (_freq), \ } static const struct ieee80211_channel hwsim_channels_2ghz[] = { CHAN2G(2412), /* Channel 1 */ CHAN2G(2417), /* Channel 2 */ CHAN2G(2422), /* Channel 3 */ CHAN2G(2427), /* Channel 4 */ CHAN2G(2432), /* Channel 5 */ CHAN2G(2437), /* Channel 6 */ CHAN2G(2442), /* Channel 7 */ CHAN2G(2447), /* Channel 8 */ CHAN2G(2452), /* Channel 9 */ CHAN2G(2457), /* Channel 10 */ CHAN2G(2462), /* Channel 11 */ CHAN2G(2467), /* Channel 12 */ CHAN2G(2472), /* Channel 13 */ CHAN2G(2484), /* Channel 14 */ }; static const struct ieee80211_channel hwsim_channels_5ghz[] = { CHAN5G(5180), /* Channel 36 */ CHAN5G(5200), /* Channel 40 */ CHAN5G(5220), /* Channel 44 */ CHAN5G(5240), /* Channel 48 */ CHAN5G(5260), /* Channel 52 */ CHAN5G(5280), /* Channel 56 */ CHAN5G(5300), /* Channel 60 */ CHAN5G(5320), /* Channel 64 */ CHAN5G(5500), /* Channel 100 */ CHAN5G(5520), /* Channel 104 */ CHAN5G(5540), /* Channel 108 */ CHAN5G(5560), /* Channel 112 */ CHAN5G(5580), /* Channel 116 */ CHAN5G(5600), /* Channel 120 */ CHAN5G(5620), /* Channel 124 */ CHAN5G(5640), /* Channel 128 */ CHAN5G(5660), /* Channel 132 */ CHAN5G(5680), /* Channel 136 */ CHAN5G(5700), /* Channel 140 */ CHAN5G(5745), /* Channel 149 */ CHAN5G(5765), /* Channel 153 */ CHAN5G(5785), /* Channel 157 */ CHAN5G(5805), /* Channel 161 */ CHAN5G(5825), /* Channel 165 */ CHAN5G(5845), /* Channel 169 */ CHAN5G(5855), /* Channel 171 */ CHAN5G(5860), /* Channel 172 */ CHAN5G(5865), /* Channel 173 */ CHAN5G(5870), /* Channel 174 */ CHAN5G(5875), /* Channel 175 */ CHAN5G(5880), /* Channel 176 */ CHAN5G(5885), /* Channel 177 */ CHAN5G(5890), /* Channel 178 */ CHAN5G(5895), /* Channel 179 */ CHAN5G(5900), /* Channel 180 */ CHAN5G(5905), /* Channel 181 */ CHAN5G(5910), /* Channel 182 */ CHAN5G(5915), /* Channel 183 */ CHAN5G(5920), /* Channel 184 */ CHAN5G(5925), /* Channel 185 */ }; static const struct ieee80211_channel hwsim_channels_6ghz[] = { CHAN6G(5955), /* Channel 1 */ CHAN6G(5975), /* Channel 5 */ CHAN6G(5995), /* Channel 9 */ CHAN6G(6015), /* Channel 13 */ CHAN6G(6035), /* Channel 17 */ CHAN6G(6055), /* Channel 21 */ CHAN6G(6075), /* Channel 25 */ CHAN6G(6095), /* Channel 29 */ CHAN6G(6115), /* Channel 33 */ CHAN6G(6135), /* Channel 37 */ CHAN6G(6155), /* Channel 41 */ CHAN6G(6175), /* Channel 45 */ CHAN6G(6195), /* Channel 49 */ CHAN6G(6215), /* Channel 53 */ CHAN6G(6235), /* Channel 57 */ CHAN6G(6255), /* Channel 61 */ CHAN6G(6275), /* Channel 65 */ CHAN6G(6295), /* Channel 69 */ CHAN6G(6315), /* Channel 73 */ CHAN6G(6335), /* Channel 77 */ CHAN6G(6355), /* Channel 81 */ CHAN6G(6375), /* Channel 85 */ CHAN6G(6395), /* Channel 89 */ CHAN6G(6415), /* Channel 93 */ CHAN6G(6435), /* Channel 97 */ CHAN6G(6455), /* Channel 181 */ CHAN6G(6475), /* Channel 105 */ CHAN6G(6495), /* Channel 109 */ CHAN6G(6515), /* Channel 113 */ CHAN6G(6535), /* Channel 117 */ CHAN6G(6555), /* Channel 121 */ CHAN6G(6575), /* Channel 125 */ CHAN6G(6595), /* Channel 129 */ CHAN6G(6615), /* Channel 133 */ CHAN6G(6635), /* Channel 137 */ CHAN6G(6655), /* Channel 141 */ CHAN6G(6675), /* Channel 145 */ CHAN6G(6695), /* Channel 149 */ CHAN6G(6715), /* Channel 153 */ CHAN6G(6735), /* Channel 157 */ CHAN6G(6755), /* Channel 161 */ CHAN6G(6775), /* Channel 165 */ CHAN6G(6795), /* Channel 169 */ CHAN6G(6815), /* Channel 173 */ CHAN6G(6835), /* Channel 177 */ CHAN6G(6855), /* Channel 181 */ CHAN6G(6875), /* Channel 185 */ CHAN6G(6895), /* Channel 189 */ CHAN6G(6915), /* Channel 193 */ CHAN6G(6935), /* Channel 197 */ CHAN6G(6955), /* Channel 201 */ CHAN6G(6975), /* Channel 205 */ CHAN6G(6995), /* Channel 209 */ CHAN6G(7015), /* Channel 213 */ CHAN6G(7035), /* Channel 217 */ CHAN6G(7055), /* Channel 221 */ CHAN6G(7075), /* Channel 225 */ CHAN6G(7095), /* Channel 229 */ CHAN6G(7115), /* Channel 233 */ }; #define NUM_S1G_CHANS_US 51 static struct ieee80211_channel hwsim_channels_s1g[NUM_S1G_CHANS_US]; static const struct ieee80211_sta_s1g_cap hwsim_s1g_cap = { .s1g = true, .cap = { S1G_CAP0_SGI_1MHZ | S1G_CAP0_SGI_2MHZ, 0, 0, S1G_CAP3_MAX_MPDU_LEN, 0, S1G_CAP5_AMPDU, 0, S1G_CAP7_DUP_1MHZ, S1G_CAP8_TWT_RESPOND | S1G_CAP8_TWT_REQUEST, 0}, .nss_mcs = { 0xfc | 1, /* MCS 7 for 1 SS */ /* RX Highest Supported Long GI Data Rate 0:7 */ 0, /* RX Highest Supported Long GI Data Rate 0:7 */ /* TX S1G MCS Map 0:6 */ 0xfa, /* TX S1G MCS Map :7 */ /* TX Highest Supported Long GI Data Rate 0:6 */ 0x80, /* TX Highest Supported Long GI Data Rate 7:8 */ /* Rx Single spatial stream and S1G-MCS Map for 1MHz */ /* Tx Single spatial stream and S1G-MCS Map for 1MHz */ 0 }, }; static void hwsim_init_s1g_channels(struct ieee80211_channel *chans) { int ch, freq; for (ch = 0; ch < NUM_S1G_CHANS_US; ch++) { freq = 902000 + (ch + 1) * 500; chans[ch].band = NL80211_BAND_S1GHZ; chans[ch].center_freq = KHZ_TO_MHZ(freq); chans[ch].freq_offset = freq % 1000; chans[ch].hw_value = ch + 1; } } static const struct ieee80211_rate hwsim_rates[] = { { .bitrate = 10 }, { .bitrate = 20, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, { .bitrate = 55, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, { .bitrate = 110, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, { .bitrate = 60 }, { .bitrate = 90 }, { .bitrate = 120 }, { .bitrate = 180 }, { .bitrate = 240 }, { .bitrate = 360 }, { .bitrate = 480 }, { .bitrate = 540 } }; #define DEFAULT_RX_RSSI -50 static const u32 hwsim_ciphers[] = { WLAN_CIPHER_SUITE_WEP40, WLAN_CIPHER_SUITE_WEP104, WLAN_CIPHER_SUITE_TKIP, WLAN_CIPHER_SUITE_CCMP, WLAN_CIPHER_SUITE_CCMP_256, WLAN_CIPHER_SUITE_GCMP, WLAN_CIPHER_SUITE_GCMP_256, WLAN_CIPHER_SUITE_AES_CMAC, WLAN_CIPHER_SUITE_BIP_CMAC_256, WLAN_CIPHER_SUITE_BIP_GMAC_128, WLAN_CIPHER_SUITE_BIP_GMAC_256, }; #define OUI_QCA 0x001374 #define QCA_NL80211_SUBCMD_TEST 1 enum qca_nl80211_vendor_subcmds { QCA_WLAN_VENDOR_ATTR_TEST = 8, QCA_WLAN_VENDOR_ATTR_MAX = QCA_WLAN_VENDOR_ATTR_TEST }; static const struct nla_policy hwsim_vendor_test_policy[QCA_WLAN_VENDOR_ATTR_MAX + 1] = { [QCA_WLAN_VENDOR_ATTR_MAX] = { .type = NLA_U32 }, }; static int mac80211_hwsim_vendor_cmd_test(struct wiphy *wiphy, struct wireless_dev *wdev, const void *data, int data_len) { struct sk_buff *skb; struct nlattr *tb[QCA_WLAN_VENDOR_ATTR_MAX + 1]; int err; u32 val; err = nla_parse_deprecated(tb, QCA_WLAN_VENDOR_ATTR_MAX, data, data_len, hwsim_vendor_test_policy, NULL); if (err) return err; if (!tb[QCA_WLAN_VENDOR_ATTR_TEST]) return -EINVAL; val = nla_get_u32(tb[QCA_WLAN_VENDOR_ATTR_TEST]); wiphy_dbg(wiphy, "%s: test=%u\n", __func__, val); /* Send a vendor event as a test. Note that this would not normally be * done within a command handler, but rather, based on some other * trigger. For simplicity, this command is used to trigger the event * here. * * event_idx = 0 (index in mac80211_hwsim_vendor_commands) */ skb = cfg80211_vendor_event_alloc(wiphy, wdev, 100, 0, GFP_KERNEL); if (skb) { /* skb_put() or nla_put() will fill up data within * NL80211_ATTR_VENDOR_DATA. */ /* Add vendor data */ nla_put_u32(skb, QCA_WLAN_VENDOR_ATTR_TEST, val + 1); /* Send the event - this will call nla_nest_end() */ cfg80211_vendor_event(skb, GFP_KERNEL); } /* Send a response to the command */ skb = cfg80211_vendor_cmd_alloc_reply_skb(wiphy, 10); if (!skb) return -ENOMEM; /* skb_put() or nla_put() will fill up data within * NL80211_ATTR_VENDOR_DATA */ nla_put_u32(skb, QCA_WLAN_VENDOR_ATTR_TEST, val + 2); return cfg80211_vendor_cmd_reply(skb); } static struct wiphy_vendor_command mac80211_hwsim_vendor_commands[] = { { .info = { .vendor_id = OUI_QCA, .subcmd = QCA_NL80211_SUBCMD_TEST }, .flags = WIPHY_VENDOR_CMD_NEED_NETDEV, .doit = mac80211_hwsim_vendor_cmd_test, .policy = hwsim_vendor_test_policy, .maxattr = QCA_WLAN_VENDOR_ATTR_MAX, } }; /* Advertise support vendor specific events */ static const struct nl80211_vendor_cmd_info mac80211_hwsim_vendor_events[] = { { .vendor_id = OUI_QCA, .subcmd = 1 }, }; static DEFINE_SPINLOCK(hwsim_radio_lock); static LIST_HEAD(hwsim_radios); static struct rhashtable hwsim_radios_rht; static int hwsim_radio_idx; static int hwsim_radios_generation = 1; static struct platform_driver mac80211_hwsim_driver = { .driver = { .name = "mac80211_hwsim", }, }; struct mac80211_hwsim_link_data { u32 link_id; u64 beacon_int /* beacon interval in us */; struct hrtimer beacon_timer; }; struct mac80211_hwsim_data { struct list_head list; struct rhash_head rht; struct ieee80211_hw *hw; struct device *dev; struct ieee80211_supported_band bands[NUM_NL80211_BANDS]; struct ieee80211_channel channels_2ghz[ARRAY_SIZE(hwsim_channels_2ghz)]; struct ieee80211_channel channels_5ghz[ARRAY_SIZE(hwsim_channels_5ghz)]; struct ieee80211_channel channels_6ghz[ARRAY_SIZE(hwsim_channels_6ghz)]; struct ieee80211_channel channels_s1g[ARRAY_SIZE(hwsim_channels_s1g)]; struct ieee80211_rate rates[ARRAY_SIZE(hwsim_rates)]; struct ieee80211_iface_combination if_combination; struct ieee80211_iface_limit if_limits[3]; int n_if_limits; u32 ciphers[ARRAY_SIZE(hwsim_ciphers)]; struct mac_address addresses[2]; int channels, idx; bool use_chanctx; bool destroy_on_close; u32 portid; char alpha2[2]; const struct ieee80211_regdomain *regd; struct ieee80211_channel *tmp_chan; struct ieee80211_channel *roc_chan; u32 roc_duration; struct delayed_work roc_start; struct delayed_work roc_done; struct delayed_work hw_scan; struct cfg80211_scan_request *hw_scan_request; struct ieee80211_vif *hw_scan_vif; int scan_chan_idx; u8 scan_addr[ETH_ALEN]; struct { struct ieee80211_channel *channel; unsigned long next_start, start, end; } survey_data[ARRAY_SIZE(hwsim_channels_2ghz) + ARRAY_SIZE(hwsim_channels_5ghz) + ARRAY_SIZE(hwsim_channels_6ghz)]; struct ieee80211_channel *channel; enum nl80211_chan_width bw; unsigned int rx_filter; bool started, idle, scanning; struct mutex mutex; enum ps_mode { PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL } ps; bool ps_poll_pending; struct dentry *debugfs; atomic_t pending_cookie; struct sk_buff_head pending; /* packets pending */ /* * Only radios in the same group can communicate together (the * channel has to match too). Each bit represents a group. A * radio can be in more than one group. */ u64 group; /* group shared by radios created in the same netns */ int netgroup; /* wmediumd portid responsible for netgroup of this radio */ u32 wmediumd; /* difference between this hw's clock and the real clock, in usecs */ s64 tsf_offset; s64 bcn_delta; /* absolute beacon transmission time. Used to cover up "tx" delay. */ u64 abs_bcn_ts; /* Stats */ u64 tx_pkts; u64 rx_pkts; u64 tx_bytes; u64 rx_bytes; u64 tx_dropped; u64 tx_failed; /* RSSI in rx status of the receiver */ int rx_rssi; /* only used when pmsr capability is supplied */ struct cfg80211_pmsr_capabilities pmsr_capa; struct cfg80211_pmsr_request *pmsr_request; struct wireless_dev *pmsr_request_wdev; struct mac80211_hwsim_link_data link_data[IEEE80211_MLD_MAX_NUM_LINKS]; }; static const struct rhashtable_params hwsim_rht_params = { .nelem_hint = 2, .automatic_shrinking = true, .key_len = ETH_ALEN, .key_offset = offsetof(struct mac80211_hwsim_data, addresses[1]), .head_offset = offsetof(struct mac80211_hwsim_data, rht), }; struct hwsim_radiotap_hdr { struct ieee80211_radiotap_header hdr; __le64 rt_tsft; u8 rt_flags; u8 rt_rate; __le16 rt_channel; __le16 rt_chbitmask; } __packed; struct hwsim_radiotap_ack_hdr { struct ieee80211_radiotap_header hdr; u8 rt_flags; u8 pad; __le16 rt_channel; __le16 rt_chbitmask; } __packed; static struct mac80211_hwsim_data *get_hwsim_data_ref_from_addr(const u8 *addr) { return rhashtable_lookup_fast(&hwsim_radios_rht, addr, hwsim_rht_params); } /* MAC80211_HWSIM netlink family */ static struct genl_family hwsim_genl_family; enum hwsim_multicast_groups { HWSIM_MCGRP_CONFIG, }; static const struct genl_multicast_group hwsim_mcgrps[] = { [HWSIM_MCGRP_CONFIG] = { .name = "config", }, }; /* MAC80211_HWSIM netlink policy */ static const struct nla_policy hwsim_rate_info_policy[HWSIM_RATE_INFO_ATTR_MAX + 1] = { [HWSIM_RATE_INFO_ATTR_FLAGS] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_MCS] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_LEGACY] = { .type = NLA_U16 }, [HWSIM_RATE_INFO_ATTR_NSS] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_BW] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_HE_GI] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_HE_DCM] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_EHT_GI] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC] = { .type = NLA_U8 }, }; static const struct nla_policy hwsim_ftm_result_policy[NL80211_PMSR_FTM_RESP_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX] = { .type = NLA_U16 }, [NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_TX_RATE] = NLA_POLICY_NESTED(hwsim_rate_info_policy), [NL80211_PMSR_FTM_RESP_ATTR_RX_RATE] = NLA_POLICY_NESTED(hwsim_rate_info_policy), [NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_LCI] = { .type = NLA_STRING }, [NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_STRING }, }; static const struct nla_policy hwsim_pmsr_resp_type_policy[NL80211_PMSR_TYPE_MAX + 1] = { [NL80211_PMSR_TYPE_FTM] = NLA_POLICY_NESTED(hwsim_ftm_result_policy), }; static const struct nla_policy hwsim_pmsr_resp_policy[NL80211_PMSR_RESP_ATTR_MAX + 1] = { [NL80211_PMSR_RESP_ATTR_STATUS] = { .type = NLA_U32 }, [NL80211_PMSR_RESP_ATTR_HOST_TIME] = { .type = NLA_U64 }, [NL80211_PMSR_RESP_ATTR_AP_TSF] = { .type = NLA_U64 }, [NL80211_PMSR_RESP_ATTR_FINAL] = { .type = NLA_FLAG }, [NL80211_PMSR_RESP_ATTR_DATA] = NLA_POLICY_NESTED(hwsim_pmsr_resp_type_policy), }; static const struct nla_policy hwsim_pmsr_peer_result_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR_COMPAT, [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_REJECT }, [NL80211_PMSR_PEER_ATTR_REQ] = { .type = NLA_REJECT }, [NL80211_PMSR_PEER_ATTR_RESP] = NLA_POLICY_NESTED(hwsim_pmsr_resp_policy), }; static const struct nla_policy hwsim_pmsr_peers_result_policy[NL80211_PMSR_ATTR_MAX + 1] = { [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_PEERS] = NLA_POLICY_NESTED_ARRAY(hwsim_pmsr_peer_result_policy), }; static const struct nla_policy hwsim_ftm_capa_policy[NL80211_PMSR_FTM_CAPA_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_CAPA_ATTR_ASAP] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT] = NLA_POLICY_MAX(NLA_U8, 15), [NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST] = NLA_POLICY_MAX(NLA_U8, 31), [NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, }; static const struct nla_policy hwsim_pmsr_capa_type_policy[NL80211_PMSR_TYPE_MAX + 1] = { [NL80211_PMSR_TYPE_FTM] = NLA_POLICY_NESTED(hwsim_ftm_capa_policy), }; static const struct nla_policy hwsim_pmsr_capa_policy[NL80211_PMSR_ATTR_MAX + 1] = { [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_U32 }, [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_FLAG }, [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_FLAG }, [NL80211_PMSR_ATTR_TYPE_CAPA] = NLA_POLICY_NESTED(hwsim_pmsr_capa_type_policy), [NL80211_PMSR_ATTR_PEERS] = { .type = NLA_REJECT }, // only for request. }; static const struct nla_policy hwsim_genl_policy[HWSIM_ATTR_MAX + 1] = { [HWSIM_ATTR_ADDR_RECEIVER] = NLA_POLICY_ETH_ADDR_COMPAT, [HWSIM_ATTR_ADDR_TRANSMITTER] = NLA_POLICY_ETH_ADDR_COMPAT, [HWSIM_ATTR_FRAME] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, [HWSIM_ATTR_FLAGS] = { .type = NLA_U32 }, [HWSIM_ATTR_RX_RATE] = { .type = NLA_U32 }, [HWSIM_ATTR_SIGNAL] = { .type = NLA_U32 }, [HWSIM_ATTR_TX_INFO] = { .type = NLA_BINARY, .len = IEEE80211_TX_MAX_RATES * sizeof(struct hwsim_tx_rate)}, [HWSIM_ATTR_COOKIE] = { .type = NLA_U64 }, [HWSIM_ATTR_CHANNELS] = { .type = NLA_U32 }, [HWSIM_ATTR_RADIO_ID] = { .type = NLA_U32 }, [HWSIM_ATTR_REG_HINT_ALPHA2] = { .type = NLA_STRING, .len = 2 }, [HWSIM_ATTR_REG_CUSTOM_REG] = { .type = NLA_U32 }, [HWSIM_ATTR_REG_STRICT_REG] = { .type = NLA_FLAG }, [HWSIM_ATTR_SUPPORT_P2P_DEVICE] = { .type = NLA_FLAG }, [HWSIM_ATTR_USE_CHANCTX] = { .type = NLA_FLAG }, [HWSIM_ATTR_DESTROY_RADIO_ON_CLOSE] = { .type = NLA_FLAG }, [HWSIM_ATTR_RADIO_NAME] = { .type = NLA_STRING }, [HWSIM_ATTR_NO_VIF] = { .type = NLA_FLAG }, [HWSIM_ATTR_FREQ] = { .type = NLA_U32 }, [HWSIM_ATTR_TX_INFO_FLAGS] = { .type = NLA_BINARY }, [HWSIM_ATTR_PERM_ADDR] = NLA_POLICY_ETH_ADDR_COMPAT, [HWSIM_ATTR_IFTYPE_SUPPORT] = { .type = NLA_U32 }, [HWSIM_ATTR_CIPHER_SUPPORT] = { .type = NLA_BINARY }, [HWSIM_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, [HWSIM_ATTR_PMSR_SUPPORT] = NLA_POLICY_NESTED(hwsim_pmsr_capa_policy), [HWSIM_ATTR_PMSR_RESULT] = NLA_POLICY_NESTED(hwsim_pmsr_peers_result_policy), }; #if IS_REACHABLE(CONFIG_VIRTIO) /* MAC80211_HWSIM virtio queues */ static struct virtqueue *hwsim_vqs[HWSIM_NUM_VQS]; static bool hwsim_virtio_enabled; static DEFINE_SPINLOCK(hwsim_virtio_lock); static void hwsim_virtio_rx_work(struct work_struct *work); static DECLARE_WORK(hwsim_virtio_rx, hwsim_virtio_rx_work); static int hwsim_tx_virtio(struct mac80211_hwsim_data *data, struct sk_buff *skb) { struct scatterlist sg[1]; unsigned long flags; int err; spin_lock_irqsave(&hwsim_virtio_lock, flags); if (!hwsim_virtio_enabled) { err = -ENODEV; goto out_free; } sg_init_one(sg, skb->head, skb_end_offset(skb)); err = virtqueue_add_outbuf(hwsim_vqs[HWSIM_VQ_TX], sg, 1, skb, GFP_ATOMIC); if (err) goto out_free; virtqueue_kick(hwsim_vqs[HWSIM_VQ_TX]); spin_unlock_irqrestore(&hwsim_virtio_lock, flags); return 0; out_free: spin_unlock_irqrestore(&hwsim_virtio_lock, flags); nlmsg_free(skb); return err; } #else /* cause a linker error if this ends up being needed */ extern int hwsim_tx_virtio(struct mac80211_hwsim_data *data, struct sk_buff *skb); #define hwsim_virtio_enabled false #endif static int hwsim_get_chanwidth(enum nl80211_chan_width bw) { switch (bw) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return 20; case NL80211_CHAN_WIDTH_40: return 40; case NL80211_CHAN_WIDTH_80: return 80; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: return 160; case NL80211_CHAN_WIDTH_320: return 320; case NL80211_CHAN_WIDTH_5: return 5; case NL80211_CHAN_WIDTH_10: return 10; case NL80211_CHAN_WIDTH_1: return 1; case NL80211_CHAN_WIDTH_2: return 2; case NL80211_CHAN_WIDTH_4: return 4; case NL80211_CHAN_WIDTH_8: return 8; case NL80211_CHAN_WIDTH_16: return 16; } return INT_MAX; } static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_channel *chan); /* sysfs attributes */ static void hwsim_send_ps_poll(void *dat, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = dat; struct hwsim_vif_priv *vp = (void *)vif->drv_priv; struct sk_buff *skb; struct ieee80211_pspoll *pspoll; if (!vp->assoc) return; wiphy_dbg(data->hw->wiphy, "%s: send PS-Poll to %pM for aid %d\n", __func__, vp->bssid, vp->aid); skb = dev_alloc_skb(sizeof(*pspoll)); if (!skb) return; pspoll = skb_put(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM); pspoll->aid = cpu_to_le16(0xc000 | vp->aid); memcpy(pspoll->bssid, vp->bssid, ETH_ALEN); memcpy(pspoll->ta, mac, ETH_ALEN); rcu_read_lock(); mac80211_hwsim_tx_frame(data->hw, skb, rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan); rcu_read_unlock(); } static void hwsim_send_nullfunc(struct mac80211_hwsim_data *data, u8 *mac, struct ieee80211_vif *vif, int ps) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; struct sk_buff *skb; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *cb; if (!vp->assoc) return; wiphy_dbg(data->hw->wiphy, "%s: send data::nullfunc to %pM ps=%d\n", __func__, vp->bssid, ps); skb = dev_alloc_skb(sizeof(*hdr)); if (!skb) return; hdr = skb_put(skb, sizeof(*hdr) - ETH_ALEN); hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS | (ps ? IEEE80211_FCTL_PM : 0)); hdr->duration_id = cpu_to_le16(0); memcpy(hdr->addr1, vp->bssid, ETH_ALEN); memcpy(hdr->addr2, mac, ETH_ALEN); memcpy(hdr->addr3, vp->bssid, ETH_ALEN); cb = IEEE80211_SKB_CB(skb); cb->control.rates[0].count = 1; cb->control.rates[1].idx = -1; rcu_read_lock(); mac80211_hwsim_tx_frame(data->hw, skb, rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan); rcu_read_unlock(); } static void hwsim_send_nullfunc_ps(void *dat, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = dat; hwsim_send_nullfunc(data, mac, vif, 1); } static void hwsim_send_nullfunc_no_ps(void *dat, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = dat; hwsim_send_nullfunc(data, mac, vif, 0); } static int hwsim_fops_ps_read(void *dat, u64 *val) { struct mac80211_hwsim_data *data = dat; *val = data->ps; return 0; } static int hwsim_fops_ps_write(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; enum ps_mode old_ps; if (val != PS_DISABLED && val != PS_ENABLED && val != PS_AUTO_POLL && val != PS_MANUAL_POLL) return -EINVAL; if (val == PS_MANUAL_POLL) { if (data->ps != PS_ENABLED) return -EINVAL; local_bh_disable(); ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, hwsim_send_ps_poll, data); local_bh_enable(); return 0; } old_ps = data->ps; data->ps = val; local_bh_disable(); if (old_ps == PS_DISABLED && val != PS_DISABLED) { ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, hwsim_send_nullfunc_ps, data); } else if (old_ps != PS_DISABLED && val == PS_DISABLED) { ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, hwsim_send_nullfunc_no_ps, data); } local_bh_enable(); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_ps, hwsim_fops_ps_read, hwsim_fops_ps_write, "%llu\n"); static int hwsim_write_simulate_radar(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; ieee80211_radar_detected(data->hw); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_simulate_radar, NULL, hwsim_write_simulate_radar, "%llu\n"); static int hwsim_fops_group_read(void *dat, u64 *val) { struct mac80211_hwsim_data *data = dat; *val = data->group; return 0; } static int hwsim_fops_group_write(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; data->group = val; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_group, hwsim_fops_group_read, hwsim_fops_group_write, "%llx\n"); static int hwsim_fops_rx_rssi_read(void *dat, u64 *val) { struct mac80211_hwsim_data *data = dat; *val = data->rx_rssi; return 0; } static int hwsim_fops_rx_rssi_write(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; int rssi = (int)val; if (rssi >= 0 || rssi < -100) return -EINVAL; data->rx_rssi = rssi; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_rx_rssi, hwsim_fops_rx_rssi_read, hwsim_fops_rx_rssi_write, "%lld\n"); static netdev_tx_t hwsim_mon_xmit(struct sk_buff *skb, struct net_device *dev) { /* TODO: allow packet injection */ dev_kfree_skb(skb); return NETDEV_TX_OK; } static inline u64 mac80211_hwsim_get_tsf_raw(void) { return ktime_to_us(ktime_get_real()); } static __le64 __mac80211_hwsim_get_tsf(struct mac80211_hwsim_data *data) { u64 now = mac80211_hwsim_get_tsf_raw(); return cpu_to_le64(now + data->tsf_offset); } static u64 mac80211_hwsim_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = hw->priv; return le64_to_cpu(__mac80211_hwsim_get_tsf(data)); } static void mac80211_hwsim_set_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 tsf) { struct mac80211_hwsim_data *data = hw->priv; u64 now = mac80211_hwsim_get_tsf(hw, vif); /* MLD not supported here */ u32 bcn_int = data->link_data[0].beacon_int; u64 delta = abs(tsf - now); /* adjust after beaconing with new timestamp at old TBTT */ if (tsf > now) { data->tsf_offset += delta; data->bcn_delta = do_div(delta, bcn_int); } else { data->tsf_offset -= delta; data->bcn_delta = -(s64)do_div(delta, bcn_int); } } static void mac80211_hwsim_monitor_rx(struct ieee80211_hw *hw, struct sk_buff *tx_skb, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv; struct sk_buff *skb; struct hwsim_radiotap_hdr *hdr; u16 flags, bitrate; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx_skb); struct ieee80211_rate *txrate = ieee80211_get_tx_rate(hw, info); if (!txrate) bitrate = 0; else bitrate = txrate->bitrate; if (!netif_running(hwsim_mon)) return; skb = skb_copy_expand(tx_skb, sizeof(*hdr), 0, GFP_ATOMIC); if (skb == NULL) return; hdr = skb_push(skb, sizeof(*hdr)); hdr->hdr.it_version = PKTHDR_RADIOTAP_VERSION; hdr->hdr.it_pad = 0; hdr->hdr.it_len = cpu_to_le16(sizeof(*hdr)); hdr->hdr.it_present = cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) | (1 << IEEE80211_RADIOTAP_RATE) | (1 << IEEE80211_RADIOTAP_TSFT) | (1 << IEEE80211_RADIOTAP_CHANNEL)); hdr->rt_tsft = __mac80211_hwsim_get_tsf(data); hdr->rt_flags = 0; hdr->rt_rate = bitrate / 5; hdr->rt_channel = cpu_to_le16(chan->center_freq); flags = IEEE80211_CHAN_2GHZ; if (txrate && txrate->flags & IEEE80211_RATE_ERP_G) flags |= IEEE80211_CHAN_OFDM; else flags |= IEEE80211_CHAN_CCK; hdr->rt_chbitmask = cpu_to_le16(flags); skb->dev = hwsim_mon; skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } static void mac80211_hwsim_monitor_ack(struct ieee80211_channel *chan, const u8 *addr) { struct sk_buff *skb; struct hwsim_radiotap_ack_hdr *hdr; u16 flags; struct ieee80211_hdr *hdr11; if (!netif_running(hwsim_mon)) return; skb = dev_alloc_skb(100); if (skb == NULL) return; hdr = skb_put(skb, sizeof(*hdr)); hdr->hdr.it_version = PKTHDR_RADIOTAP_VERSION; hdr->hdr.it_pad = 0; hdr->hdr.it_len = cpu_to_le16(sizeof(*hdr)); hdr->hdr.it_present = cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) | (1 << IEEE80211_RADIOTAP_CHANNEL)); hdr->rt_flags = 0; hdr->pad = 0; hdr->rt_channel = cpu_to_le16(chan->center_freq); flags = IEEE80211_CHAN_2GHZ; hdr->rt_chbitmask = cpu_to_le16(flags); hdr11 = skb_put(skb, 10); hdr11->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_ACK); hdr11->duration_id = cpu_to_le16(0); memcpy(hdr11->addr1, addr, ETH_ALEN); skb->dev = hwsim_mon; skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } struct mac80211_hwsim_addr_match_data { u8 addr[ETH_ALEN]; bool ret; }; static void mac80211_hwsim_addr_iter(void *data, u8 *mac, struct ieee80211_vif *vif) { int i; struct mac80211_hwsim_addr_match_data *md = data; if (memcmp(mac, md->addr, ETH_ALEN) == 0) { md->ret = true; return; } /* Match the link address */ for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) { struct ieee80211_bss_conf *conf; conf = rcu_dereference(vif->link_conf[i]); if (!conf) continue; if (memcmp(conf->addr, md->addr, ETH_ALEN) == 0) { md->ret = true; return; } } } static bool mac80211_hwsim_addr_match(struct mac80211_hwsim_data *data, const u8 *addr) { struct mac80211_hwsim_addr_match_data md = { .ret = false, }; if (data->scanning && memcmp(addr, data->scan_addr, ETH_ALEN) == 0) return true; memcpy(md.addr, addr, ETH_ALEN); ieee80211_iterate_active_interfaces_atomic(data->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_addr_iter, &md); return md.ret; } static bool hwsim_ps_rx_ok(struct mac80211_hwsim_data *data, struct sk_buff *skb) { switch (data->ps) { case PS_DISABLED: return true; case PS_ENABLED: return false; case PS_AUTO_POLL: /* TODO: accept (some) Beacons by default and other frames only * if pending PS-Poll has been sent */ return true; case PS_MANUAL_POLL: /* Allow unicast frames to own address if there is a pending * PS-Poll */ if (data->ps_poll_pending && mac80211_hwsim_addr_match(data, skb->data + 4)) { data->ps_poll_pending = false; return true; } return false; } return true; } static int hwsim_unicast_netgroup(struct mac80211_hwsim_data *data, struct sk_buff *skb, int portid) { struct net *net; bool found = false; int res = -ENOENT; rcu_read_lock(); for_each_net_rcu(net) { if (data->netgroup == hwsim_net_get_netgroup(net)) { res = genlmsg_unicast(net, skb, portid); found = true; break; } } rcu_read_unlock(); if (!found) nlmsg_free(skb); return res; } static void mac80211_hwsim_config_mac_nl(struct ieee80211_hw *hw, const u8 *addr, bool add) { struct mac80211_hwsim_data *data = hw->priv; u32 _portid = READ_ONCE(data->wmediumd); struct sk_buff *skb; void *msg_head; WARN_ON(!is_valid_ether_addr(addr)); if (!_portid && !hwsim_virtio_enabled) return; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, add ? HWSIM_CMD_ADD_MAC_ADDR : HWSIM_CMD_DEL_MAC_ADDR); if (!msg_head) { pr_debug("mac80211_hwsim: problem with msg_head\n"); goto nla_put_failure; } if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) goto nla_put_failure; if (nla_put(skb, HWSIM_ATTR_ADDR_RECEIVER, ETH_ALEN, addr)) goto nla_put_failure; genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) hwsim_tx_virtio(data, skb); else hwsim_unicast_netgroup(data, skb, _portid); return; nla_put_failure: nlmsg_free(skb); } static inline u16 trans_tx_rate_flags_ieee2hwsim(struct ieee80211_tx_rate *rate) { u16 result = 0; if (rate->flags & IEEE80211_TX_RC_USE_RTS_CTS) result |= MAC80211_HWSIM_TX_RC_USE_RTS_CTS; if (rate->flags & IEEE80211_TX_RC_USE_CTS_PROTECT) result |= MAC80211_HWSIM_TX_RC_USE_CTS_PROTECT; if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) result |= MAC80211_HWSIM_TX_RC_USE_SHORT_PREAMBLE; if (rate->flags & IEEE80211_TX_RC_MCS) result |= MAC80211_HWSIM_TX_RC_MCS; if (rate->flags & IEEE80211_TX_RC_GREEN_FIELD) result |= MAC80211_HWSIM_TX_RC_GREEN_FIELD; if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) result |= MAC80211_HWSIM_TX_RC_40_MHZ_WIDTH; if (rate->flags & IEEE80211_TX_RC_DUP_DATA) result |= MAC80211_HWSIM_TX_RC_DUP_DATA; if (rate->flags & IEEE80211_TX_RC_SHORT_GI) result |= MAC80211_HWSIM_TX_RC_SHORT_GI; if (rate->flags & IEEE80211_TX_RC_VHT_MCS) result |= MAC80211_HWSIM_TX_RC_VHT_MCS; if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) result |= MAC80211_HWSIM_TX_RC_80_MHZ_WIDTH; if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH) result |= MAC80211_HWSIM_TX_RC_160_MHZ_WIDTH; return result; } static void mac80211_hwsim_tx_frame_nl(struct ieee80211_hw *hw, struct sk_buff *my_skb, int dst_portid, struct ieee80211_channel *channel) { struct sk_buff *skb; struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) my_skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(my_skb); void *msg_head; unsigned int hwsim_flags = 0; int i; struct hwsim_tx_rate tx_attempts[IEEE80211_TX_MAX_RATES]; struct hwsim_tx_rate_flag tx_attempts_flags[IEEE80211_TX_MAX_RATES]; uintptr_t cookie; if (data->ps != PS_DISABLED) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); /* If the queue contains MAX_QUEUE skb's drop some */ if (skb_queue_len(&data->pending) >= MAX_QUEUE) { /* Dropping until WARN_QUEUE level */ while (skb_queue_len(&data->pending) >= WARN_QUEUE) { ieee80211_free_txskb(hw, skb_dequeue(&data->pending)); data->tx_dropped++; } } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (skb == NULL) goto nla_put_failure; msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_FRAME); if (msg_head == NULL) { pr_debug("mac80211_hwsim: problem with msg_head\n"); goto nla_put_failure; } if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) goto nla_put_failure; /* We get the skb->data */ if (nla_put(skb, HWSIM_ATTR_FRAME, my_skb->len, my_skb->data)) goto nla_put_failure; /* We get the flags for this transmission, and we translate them to wmediumd flags */ if (info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS) hwsim_flags |= HWSIM_TX_CTL_REQ_TX_STATUS; if (info->flags & IEEE80211_TX_CTL_NO_ACK) hwsim_flags |= HWSIM_TX_CTL_NO_ACK; if (nla_put_u32(skb, HWSIM_ATTR_FLAGS, hwsim_flags)) goto nla_put_failure; if (nla_put_u32(skb, HWSIM_ATTR_FREQ, channel->center_freq)) goto nla_put_failure; /* We get the tx control (rate and retries) info*/ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { tx_attempts[i].idx = info->status.rates[i].idx; tx_attempts_flags[i].idx = info->status.rates[i].idx; tx_attempts[i].count = info->status.rates[i].count; tx_attempts_flags[i].flags = trans_tx_rate_flags_ieee2hwsim( &info->status.rates[i]); } if (nla_put(skb, HWSIM_ATTR_TX_INFO, sizeof(struct hwsim_tx_rate)*IEEE80211_TX_MAX_RATES, tx_attempts)) goto nla_put_failure; if (nla_put(skb, HWSIM_ATTR_TX_INFO_FLAGS, sizeof(struct hwsim_tx_rate_flag) * IEEE80211_TX_MAX_RATES, tx_attempts_flags)) goto nla_put_failure; /* We create a cookie to identify this skb */ cookie = atomic_inc_return(&data->pending_cookie); info->rate_driver_data[0] = (void *)cookie; if (nla_put_u64_64bit(skb, HWSIM_ATTR_COOKIE, cookie, HWSIM_ATTR_PAD)) goto nla_put_failure; genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) { if (hwsim_tx_virtio(data, skb)) goto err_free_txskb; } else { if (hwsim_unicast_netgroup(data, skb, dst_portid)) goto err_free_txskb; } /* Enqueue the packet */ skb_queue_tail(&data->pending, my_skb); data->tx_pkts++; data->tx_bytes += my_skb->len; return; nla_put_failure: nlmsg_free(skb); err_free_txskb: pr_debug("mac80211_hwsim: error occurred in %s\n", __func__); ieee80211_free_txskb(hw, my_skb); data->tx_failed++; } static bool hwsim_chans_compat(struct ieee80211_channel *c1, struct ieee80211_channel *c2) { if (!c1 || !c2) return false; return c1->center_freq == c2->center_freq; } struct tx_iter_data { struct ieee80211_channel *channel; bool receive; }; static void mac80211_hwsim_tx_iter(void *_data, u8 *addr, struct ieee80211_vif *vif) { struct tx_iter_data *data = _data; int i; for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) { struct ieee80211_bss_conf *conf; struct ieee80211_chanctx_conf *chanctx; conf = rcu_dereference(vif->link_conf[i]); if (!conf) continue; chanctx = rcu_dereference(conf->chanctx_conf); if (!chanctx) continue; if (!hwsim_chans_compat(data->channel, chanctx->def.chan)) continue; data->receive = true; return; } } static void mac80211_hwsim_add_vendor_rtap(struct sk_buff *skb) { /* * To enable this code, #define the HWSIM_RADIOTAP_OUI, * e.g. like this: * #define HWSIM_RADIOTAP_OUI "\x02\x00\x00" * (but you should use a valid OUI, not that) * * If anyone wants to 'donate' a radiotap OUI/subns code * please send a patch removing this #ifdef and changing * the values accordingly. */ #ifdef HWSIM_RADIOTAP_OUI struct ieee80211_radiotap_vendor_tlv *rtap; static const char vendor_data[8] = "ABCDEFGH"; // Make sure no padding is needed BUILD_BUG_ON(sizeof(vendor_data) % 4); /* this is last radiotap info before the mac header, so * skb_reset_mac_header for mac8022 to know the end of * the radiotap TLV/beginning of the 802.11 header */ skb_reset_mac_header(skb); /* * Note that this code requires the headroom in the SKB * that was allocated earlier. */ rtap = skb_push(skb, sizeof(*rtap) + sizeof(vendor_data)); rtap->len = cpu_to_le16(sizeof(*rtap) - sizeof(struct ieee80211_radiotap_tlv) + sizeof(vendor_data)); rtap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE); rtap->content.oui[0] = HWSIM_RADIOTAP_OUI[0]; rtap->content.oui[1] = HWSIM_RADIOTAP_OUI[1]; rtap->content.oui[2] = HWSIM_RADIOTAP_OUI[2]; rtap->content.oui_subtype = 127; /* clear reserved field */ rtap->content.reserved = 0; rtap->content.vendor_type = 0; memcpy(rtap->content.data, vendor_data, sizeof(vendor_data)); IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_TLV_AT_END; #endif } static void mac80211_hwsim_rx(struct mac80211_hwsim_data *data, struct ieee80211_rx_status *rx_status, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; if (!ieee80211_has_morefrags(hdr->frame_control) && !is_multicast_ether_addr(hdr->addr1) && (ieee80211_is_mgmt(hdr->frame_control) || ieee80211_is_data(hdr->frame_control))) { struct ieee80211_sta *sta; unsigned int link_id; rcu_read_lock(); sta = ieee80211_find_sta_by_link_addrs(data->hw, hdr->addr2, hdr->addr1, &link_id); if (sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; if (ieee80211_has_pm(hdr->frame_control)) sp->active_links_rx &= ~BIT(link_id); else sp->active_links_rx |= BIT(link_id); } rcu_read_unlock(); } memcpy(IEEE80211_SKB_RXCB(skb), rx_status, sizeof(*rx_status)); mac80211_hwsim_add_vendor_rtap(skb); data->rx_pkts++; data->rx_bytes += skb->len; ieee80211_rx_irqsafe(data->hw, skb); } static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv, *data2; bool ack = false; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_rx_status rx_status; u64 now; memset(&rx_status, 0, sizeof(rx_status)); rx_status.flag |= RX_FLAG_MACTIME_START; rx_status.freq = chan->center_freq; rx_status.freq_offset = chan->freq_offset ? 1 : 0; rx_status.band = chan->band; if (info->control.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { rx_status.rate_idx = ieee80211_rate_get_vht_mcs(&info->control.rates[0]); rx_status.nss = ieee80211_rate_get_vht_nss(&info->control.rates[0]); rx_status.encoding = RX_ENC_VHT; } else { rx_status.rate_idx = info->control.rates[0].idx; if (info->control.rates[0].flags & IEEE80211_TX_RC_MCS) rx_status.encoding = RX_ENC_HT; } if (info->control.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) rx_status.bw = RATE_INFO_BW_40; else if (info->control.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) rx_status.bw = RATE_INFO_BW_80; else if (info->control.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) rx_status.bw = RATE_INFO_BW_160; else rx_status.bw = RATE_INFO_BW_20; if (info->control.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) rx_status.enc_flags |= RX_ENC_FLAG_SHORT_GI; /* TODO: simulate optional packet loss */ rx_status.signal = data->rx_rssi; if (info->control.vif) rx_status.signal += info->control.vif->bss_conf.txpower; if (data->ps != PS_DISABLED) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); /* release the skb's source info */ skb_orphan(skb); skb_dst_drop(skb); skb->mark = 0; skb_ext_reset(skb); nf_reset_ct(skb); /* * Get absolute mactime here so all HWs RX at the "same time", and * absolute TX time for beacon mactime so the timestamp matches. * Giving beacons a different mactime than non-beacons looks messy, but * it helps the Toffset be exact and a ~10us mactime discrepancy * probably doesn't really matter. */ if (ieee80211_is_beacon(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control)) { rx_status.boottime_ns = ktime_get_boottime_ns(); now = data->abs_bcn_ts; } else { now = mac80211_hwsim_get_tsf_raw(); } /* Copy skb to all enabled radios that are on the current frequency */ spin_lock(&hwsim_radio_lock); list_for_each_entry(data2, &hwsim_radios, list) { struct sk_buff *nskb; struct tx_iter_data tx_iter_data = { .receive = false, .channel = chan, }; if (data == data2) continue; if (!data2->started || (data2->idle && !data2->tmp_chan) || !hwsim_ps_rx_ok(data2, skb)) continue; if (!(data->group & data2->group)) continue; if (data->netgroup != data2->netgroup) continue; if (!hwsim_chans_compat(chan, data2->tmp_chan) && !hwsim_chans_compat(chan, data2->channel)) { ieee80211_iterate_active_interfaces_atomic( data2->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_tx_iter, &tx_iter_data); if (!tx_iter_data.receive) continue; } /* * reserve some space for our vendor and the normal * radiotap header, since we're copying anyway */ if (skb->len < PAGE_SIZE && paged_rx) { struct page *page = alloc_page(GFP_ATOMIC); if (!page) continue; nskb = dev_alloc_skb(128); if (!nskb) { __free_page(page); continue; } memcpy(page_address(page), skb->data, skb->len); skb_add_rx_frag(nskb, 0, page, 0, skb->len, skb->len); } else { nskb = skb_copy(skb, GFP_ATOMIC); if (!nskb) continue; } if (mac80211_hwsim_addr_match(data2, hdr->addr1)) ack = true; rx_status.mactime = now + data2->tsf_offset; mac80211_hwsim_rx(data2, &rx_status, nskb); } spin_unlock(&hwsim_radio_lock); return ack; } static struct ieee80211_bss_conf * mac80211_hwsim_select_tx_link(struct mac80211_hwsim_data *data, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_hdr *hdr, struct ieee80211_link_sta **link_sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; int i; if (!ieee80211_vif_is_mld(vif)) return &vif->bss_conf; WARN_ON(is_multicast_ether_addr(hdr->addr1)); if (WARN_ON_ONCE(!sta || !sta->valid_links)) return &vif->bss_conf; for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) { struct ieee80211_bss_conf *bss_conf; unsigned int link_id; /* round-robin the available link IDs */ link_id = (sp->last_link + i + 1) % ARRAY_SIZE(vif->link_conf); if (!(vif->active_links & BIT(link_id))) continue; if (!(sp->active_links_rx & BIT(link_id))) continue; *link_sta = rcu_dereference(sta->link[link_id]); if (!*link_sta) continue; bss_conf = rcu_dereference(vif->link_conf[link_id]); if (WARN_ON_ONCE(!bss_conf)) continue; /* can happen while switching links */ if (!rcu_access_pointer(bss_conf->chanctx_conf)) continue; sp->last_link = link_id; return bss_conf; } return NULL; } static void mac80211_hwsim_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; bool ack; enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; u32 _portid, i; if (WARN_ON(skb->len < 10)) { /* Should not happen; just a sanity check for addr1 use */ ieee80211_free_txskb(hw, skb); return; } if (!data->use_chanctx) { channel = data->channel; confbw = data->bw; } else if (txi->hw_queue == 4) { channel = data->tmp_chan; } else { u8 link = u32_get_bits(IEEE80211_SKB_CB(skb)->control.flags, IEEE80211_TX_CTRL_MLO_LINK); struct ieee80211_vif *vif = txi->control.vif; struct ieee80211_link_sta *link_sta = NULL; struct ieee80211_sta *sta = control->sta; struct ieee80211_bss_conf *bss_conf; if (link != IEEE80211_LINK_UNSPECIFIED) { bss_conf = rcu_dereference(txi->control.vif->link_conf[link]); if (sta) link_sta = rcu_dereference(sta->link[link]); } else { bss_conf = mac80211_hwsim_select_tx_link(data, vif, sta, hdr, &link_sta); } if (unlikely(!bss_conf)) { /* if it's an MLO STA, it might have deactivated all * links temporarily - but we don't handle real PS in * this code yet, so just drop the frame in that case */ WARN(link != IEEE80211_LINK_UNSPECIFIED || !sta || !sta->mlo, "link:%d, sta:%pM, sta->mlo:%d\n", link, sta ? sta->addr : NULL, sta ? sta->mlo : -1); ieee80211_free_txskb(hw, skb); return; } if (sta && sta->mlo) { if (WARN_ON(!link_sta)) { ieee80211_free_txskb(hw, skb); return; } /* address translation to link addresses on TX */ ether_addr_copy(hdr->addr1, link_sta->addr); ether_addr_copy(hdr->addr2, bss_conf->addr); /* translate A3 only if it's the BSSID */ if (!ieee80211_has_tods(hdr->frame_control) && !ieee80211_has_fromds(hdr->frame_control)) { if (ether_addr_equal(hdr->addr3, sta->addr)) ether_addr_copy(hdr->addr3, link_sta->addr); else if (ether_addr_equal(hdr->addr3, vif->addr)) ether_addr_copy(hdr->addr3, bss_conf->addr); } /* no need to look at A4, if present it's SA */ } chanctx_conf = rcu_dereference(bss_conf->chanctx_conf); if (chanctx_conf) { channel = chanctx_conf->def.chan; confbw = chanctx_conf->def.width; } else { channel = NULL; } } if (WARN(!channel, "TX w/o channel - queue = %d\n", txi->hw_queue)) { ieee80211_free_txskb(hw, skb); return; } if (data->idle && !data->tmp_chan) { wiphy_dbg(hw->wiphy, "Trying to TX when idle - reject\n"); ieee80211_free_txskb(hw, skb); return; } if (txi->control.vif) hwsim_check_magic(txi->control.vif); if (control->sta) hwsim_check_sta_magic(control->sta); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) ieee80211_get_tx_rates(txi->control.vif, control->sta, skb, txi->control.rates, ARRAY_SIZE(txi->control.rates)); for (i = 0; i < ARRAY_SIZE(txi->control.rates); i++) { u16 rflags = txi->control.rates[i].flags; /* initialize to data->bw for 5/10 MHz handling */ enum nl80211_chan_width bw = data->bw; if (txi->control.rates[i].idx == -1) break; if (rflags & IEEE80211_TX_RC_40_MHZ_WIDTH) bw = NL80211_CHAN_WIDTH_40; else if (rflags & IEEE80211_TX_RC_80_MHZ_WIDTH) bw = NL80211_CHAN_WIDTH_80; else if (rflags & IEEE80211_TX_RC_160_MHZ_WIDTH) bw = NL80211_CHAN_WIDTH_160; if (WARN_ON(hwsim_get_chanwidth(bw) > hwsim_get_chanwidth(confbw))) return; } if (skb->len >= 24 + 8 && ieee80211_is_probe_resp(hdr->frame_control)) { /* fake header transmission time */ struct ieee80211_mgmt *mgmt; struct ieee80211_rate *txrate; /* TODO: get MCS */ int bitrate = 100; u64 ts; mgmt = (struct ieee80211_mgmt *)skb->data; txrate = ieee80211_get_tx_rate(hw, txi); if (txrate) bitrate = txrate->bitrate; ts = mac80211_hwsim_get_tsf_raw(); mgmt->u.probe_resp.timestamp = cpu_to_le64(ts + data->tsf_offset + 24 * 8 * 10 / bitrate); } mac80211_hwsim_monitor_rx(hw, skb, channel); /* wmediumd mode check */ _portid = READ_ONCE(data->wmediumd); if (_portid || hwsim_virtio_enabled) return mac80211_hwsim_tx_frame_nl(hw, skb, _portid, channel); /* NO wmediumd detected, perfect medium simulation */ data->tx_pkts++; data->tx_bytes += skb->len; ack = mac80211_hwsim_tx_frame_no_nl(hw, skb, channel); if (ack && skb->len >= 16) mac80211_hwsim_monitor_ack(channel, hdr->addr2); ieee80211_tx_info_clear_status(txi); /* frame was transmitted at most favorable rate at first attempt */ txi->control.rates[0].count = 1; txi->control.rates[1].idx = -1; if (!(txi->flags & IEEE80211_TX_CTL_NO_ACK) && ack) txi->flags |= IEEE80211_TX_STAT_ACK; ieee80211_tx_status_irqsafe(hw, skb); } static int mac80211_hwsim_start(struct ieee80211_hw *hw) { struct mac80211_hwsim_data *data = hw->priv; wiphy_dbg(hw->wiphy, "%s\n", __func__); data->started = true; return 0; } static void mac80211_hwsim_stop(struct ieee80211_hw *hw) { struct mac80211_hwsim_data *data = hw->priv; int i; data->started = false; for (i = 0; i < ARRAY_SIZE(data->link_data); i++) hrtimer_cancel(&data->link_data[i].beacon_timer); while (!skb_queue_empty(&data->pending)) ieee80211_free_txskb(hw, skb_dequeue(&data->pending)); wiphy_dbg(hw->wiphy, "%s\n", __func__); } static int mac80211_hwsim_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { wiphy_dbg(hw->wiphy, "%s (type=%d mac_addr=%pM)\n", __func__, ieee80211_vif_type_p2p(vif), vif->addr); hwsim_set_magic(vif); if (vif->type != NL80211_IFTYPE_MONITOR) mac80211_hwsim_config_mac_nl(hw, vif->addr, true); vif->cab_queue = 0; vif->hw_queue[IEEE80211_AC_VO] = 0; vif->hw_queue[IEEE80211_AC_VI] = 1; vif->hw_queue[IEEE80211_AC_BE] = 2; vif->hw_queue[IEEE80211_AC_BK] = 3; return 0; } static int mac80211_hwsim_change_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_iftype newtype, bool newp2p) { newtype = ieee80211_iftype_p2p(newtype, newp2p); wiphy_dbg(hw->wiphy, "%s (old type=%d, new type=%d, mac_addr=%pM)\n", __func__, ieee80211_vif_type_p2p(vif), newtype, vif->addr); hwsim_check_magic(vif); /* * interface may change from non-AP to AP in * which case this needs to be set up again */ vif->cab_queue = 0; return 0; } static void mac80211_hwsim_remove_interface( struct ieee80211_hw *hw, struct ieee80211_vif *vif) { wiphy_dbg(hw->wiphy, "%s (type=%d mac_addr=%pM)\n", __func__, ieee80211_vif_type_p2p(vif), vif->addr); hwsim_check_magic(vif); hwsim_clear_magic(vif); if (vif->type != NL80211_IFTYPE_MONITOR) mac80211_hwsim_config_mac_nl(hw, vif->addr, false); } static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv; u32 _portid = READ_ONCE(data->wmediumd); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) { struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); ieee80211_get_tx_rates(txi->control.vif, NULL, skb, txi->control.rates, ARRAY_SIZE(txi->control.rates)); } mac80211_hwsim_monitor_rx(hw, skb, chan); if (_portid || hwsim_virtio_enabled) return mac80211_hwsim_tx_frame_nl(hw, skb, _portid, chan); data->tx_pkts++; data->tx_bytes += skb->len; mac80211_hwsim_tx_frame_no_nl(hw, skb, chan); dev_kfree_skb(skb); } static void __mac80211_hwsim_beacon_tx(struct ieee80211_bss_conf *link_conf, struct mac80211_hwsim_data *data, struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb) { struct ieee80211_tx_info *info; struct ieee80211_rate *txrate; struct ieee80211_mgmt *mgmt; /* TODO: get MCS */ int bitrate = 100; info = IEEE80211_SKB_CB(skb); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) ieee80211_get_tx_rates(vif, NULL, skb, info->control.rates, ARRAY_SIZE(info->control.rates)); txrate = ieee80211_get_tx_rate(hw, info); if (txrate) bitrate = txrate->bitrate; mgmt = (struct ieee80211_mgmt *) skb->data; /* fake header transmission time */ data->abs_bcn_ts = mac80211_hwsim_get_tsf_raw(); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { struct ieee80211_ext *ext = (void *) mgmt; ext->u.s1g_beacon.timestamp = cpu_to_le32(data->abs_bcn_ts + data->tsf_offset + 10 * 8 * 10 / bitrate); } else { mgmt->u.beacon.timestamp = cpu_to_le64(data->abs_bcn_ts + data->tsf_offset + 24 * 8 * 10 / bitrate); } mac80211_hwsim_tx_frame(hw, skb, rcu_dereference(link_conf->chanctx_conf)->def.chan); } static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_link_data *link_data = arg; u32 link_id = link_data->link_id; struct ieee80211_bss_conf *link_conf; struct mac80211_hwsim_data *data = container_of(link_data, struct mac80211_hwsim_data, link_data[link_id]); struct ieee80211_hw *hw = data->hw; struct sk_buff *skb; hwsim_check_magic(vif); link_conf = rcu_dereference(vif->link_conf[link_id]); if (!link_conf) return; if (vif->type != NL80211_IFTYPE_AP && vif->type != NL80211_IFTYPE_MESH_POINT && vif->type != NL80211_IFTYPE_ADHOC && vif->type != NL80211_IFTYPE_OCB) return; if (vif->mbssid_tx_vif && vif->mbssid_tx_vif != vif) return; if (vif->bss_conf.ema_ap) { struct ieee80211_ema_beacons *ema; u8 i = 0; ema = ieee80211_beacon_get_template_ema_list(hw, vif, link_id); if (!ema || !ema->cnt) return; for (i = 0; i < ema->cnt; i++) { __mac80211_hwsim_beacon_tx(link_conf, data, hw, vif, ema->bcn[i].skb); ema->bcn[i].skb = NULL; /* Already freed */ } ieee80211_beacon_free_ema_list(ema); } else { skb = ieee80211_beacon_get(hw, vif, link_id); if (!skb) return; __mac80211_hwsim_beacon_tx(link_conf, data, hw, vif, skb); } while ((skb = ieee80211_get_buffered_bc(hw, vif)) != NULL) { mac80211_hwsim_tx_frame(hw, skb, rcu_dereference(link_conf->chanctx_conf)->def.chan); } if (link_conf->csa_active && ieee80211_beacon_cntdwn_is_complete(vif)) ieee80211_csa_finish(vif); } static enum hrtimer_restart mac80211_hwsim_beacon(struct hrtimer *timer) { struct mac80211_hwsim_link_data *link_data = container_of(timer, struct mac80211_hwsim_link_data, beacon_timer); struct mac80211_hwsim_data *data = container_of(link_data, struct mac80211_hwsim_data, link_data[link_data->link_id]); struct ieee80211_hw *hw = data->hw; u64 bcn_int = link_data->beacon_int; if (!data->started) return HRTIMER_NORESTART; ieee80211_iterate_active_interfaces_atomic( hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_beacon_tx, link_data); /* beacon at new TBTT + beacon interval */ if (data->bcn_delta) { bcn_int -= data->bcn_delta; data->bcn_delta = 0; } hrtimer_forward_now(&link_data->beacon_timer, ns_to_ktime(bcn_int * NSEC_PER_USEC)); return HRTIMER_RESTART; } static const char * const hwsim_chanwidths[] = { [NL80211_CHAN_WIDTH_5] = "ht5", [NL80211_CHAN_WIDTH_10] = "ht10", [NL80211_CHAN_WIDTH_20_NOHT] = "noht", [NL80211_CHAN_WIDTH_20] = "ht20", [NL80211_CHAN_WIDTH_40] = "ht40", [NL80211_CHAN_WIDTH_80] = "vht80", [NL80211_CHAN_WIDTH_80P80] = "vht80p80", [NL80211_CHAN_WIDTH_160] = "vht160", [NL80211_CHAN_WIDTH_1] = "1MHz", [NL80211_CHAN_WIDTH_2] = "2MHz", [NL80211_CHAN_WIDTH_4] = "4MHz", [NL80211_CHAN_WIDTH_8] = "8MHz", [NL80211_CHAN_WIDTH_16] = "16MHz", }; static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_conf *conf = &hw->conf; static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = { [IEEE80211_SMPS_AUTOMATIC] = "auto", [IEEE80211_SMPS_OFF] = "off", [IEEE80211_SMPS_STATIC] = "static", [IEEE80211_SMPS_DYNAMIC] = "dynamic", }; int idx; if (conf->chandef.chan) wiphy_dbg(hw->wiphy, "%s (freq=%d(%d - %d)/%s idle=%d ps=%d smps=%s)\n", __func__, conf->chandef.chan->center_freq, conf->chandef.center_freq1, conf->chandef.center_freq2, hwsim_chanwidths[conf->chandef.width], !!(conf->flags & IEEE80211_CONF_IDLE), !!(conf->flags & IEEE80211_CONF_PS), smps_modes[conf->smps_mode]); else wiphy_dbg(hw->wiphy, "%s (freq=0 idle=%d ps=%d smps=%s)\n", __func__, !!(conf->flags & IEEE80211_CONF_IDLE), !!(conf->flags & IEEE80211_CONF_PS), smps_modes[conf->smps_mode]); data->idle = !!(conf->flags & IEEE80211_CONF_IDLE); WARN_ON(conf->chandef.chan && data->use_chanctx); mutex_lock(&data->mutex); if (data->scanning && conf->chandef.chan) { for (idx = 0; idx < ARRAY_SIZE(data->survey_data); idx++) { if (data->survey_data[idx].channel == data->channel) { data->survey_data[idx].start = data->survey_data[idx].next_start; data->survey_data[idx].end = jiffies; break; } } data->channel = conf->chandef.chan; data->bw = conf->chandef.width; for (idx = 0; idx < ARRAY_SIZE(data->survey_data); idx++) { if (data->survey_data[idx].channel && data->survey_data[idx].channel != data->channel) continue; data->survey_data[idx].channel = data->channel; data->survey_data[idx].next_start = jiffies; break; } } else { data->channel = conf->chandef.chan; data->bw = conf->chandef.width; } mutex_unlock(&data->mutex); for (idx = 0; idx < ARRAY_SIZE(data->link_data); idx++) { struct mac80211_hwsim_link_data *link_data = &data->link_data[idx]; if (!data->started || !link_data->beacon_int) { hrtimer_cancel(&link_data->beacon_timer); } else if (!hrtimer_is_queued(&link_data->beacon_timer)) { u64 tsf = mac80211_hwsim_get_tsf(hw, NULL); u32 bcn_int = link_data->beacon_int; u64 until_tbtt = bcn_int - do_div(tsf, bcn_int); hrtimer_start(&link_data->beacon_timer, ns_to_ktime(until_tbtt * NSEC_PER_USEC), HRTIMER_MODE_REL_SOFT); } } return 0; } static void mac80211_hwsim_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags,u64 multicast) { struct mac80211_hwsim_data *data = hw->priv; wiphy_dbg(hw->wiphy, "%s\n", __func__); data->rx_filter = 0; if (*total_flags & FIF_ALLMULTI) data->rx_filter |= FIF_ALLMULTI; if (*total_flags & FIF_MCAST_ACTION) data->rx_filter |= FIF_MCAST_ACTION; *total_flags = data->rx_filter; } static void mac80211_hwsim_bcn_en_iter(void *data, u8 *mac, struct ieee80211_vif *vif) { unsigned int *count = data; struct hwsim_vif_priv *vp = (void *)vif->drv_priv; if (vp->bcn_en) (*count)++; } static void mac80211_hwsim_vif_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; hwsim_check_magic(vif); wiphy_dbg(hw->wiphy, "%s(changed=0x%llx vif->addr=%pM)\n", __func__, changed, vif->addr); if (changed & BSS_CHANGED_ASSOC) { wiphy_dbg(hw->wiphy, " ASSOC: assoc=%d aid=%d\n", vif->cfg.assoc, vif->cfg.aid); vp->assoc = vif->cfg.assoc; vp->aid = vif->cfg.aid; } if (vif->type == NL80211_IFTYPE_STATION && changed & BSS_CHANGED_MLD_VALID_LINKS) { u16 usable_links = ieee80211_vif_usable_links(vif); if (vif->active_links != usable_links) ieee80211_set_active_links_async(vif, usable_links); } } static void mac80211_hwsim_link_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; struct mac80211_hwsim_data *data = hw->priv; unsigned int link_id = info->link_id; struct mac80211_hwsim_link_data *link_data = &data->link_data[link_id]; hwsim_check_magic(vif); wiphy_dbg(hw->wiphy, "%s(changed=0x%llx vif->addr=%pM, link id %u)\n", __func__, (unsigned long long)changed, vif->addr, link_id); if (changed & BSS_CHANGED_BSSID) { wiphy_dbg(hw->wiphy, "%s: BSSID changed: %pM\n", __func__, info->bssid); memcpy(vp->bssid, info->bssid, ETH_ALEN); } if (changed & BSS_CHANGED_BEACON_ENABLED) { wiphy_dbg(hw->wiphy, " BCN EN: %d (BI=%u)\n", info->enable_beacon, info->beacon_int); vp->bcn_en = info->enable_beacon; if (data->started && !hrtimer_is_queued(&link_data->beacon_timer) && info->enable_beacon) { u64 tsf, until_tbtt; u32 bcn_int; link_data->beacon_int = info->beacon_int * 1024; tsf = mac80211_hwsim_get_tsf(hw, vif); bcn_int = link_data->beacon_int; until_tbtt = bcn_int - do_div(tsf, bcn_int); hrtimer_start(&link_data->beacon_timer, ns_to_ktime(until_tbtt * NSEC_PER_USEC), HRTIMER_MODE_REL_SOFT); } else if (!info->enable_beacon) { unsigned int count = 0; ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_bcn_en_iter, &count); wiphy_dbg(hw->wiphy, " beaconing vifs remaining: %u", count); if (count == 0) { hrtimer_cancel(&link_data->beacon_timer); link_data->beacon_int = 0; } } } if (changed & BSS_CHANGED_ERP_CTS_PROT) { wiphy_dbg(hw->wiphy, " ERP_CTS_PROT: %d\n", info->use_cts_prot); } if (changed & BSS_CHANGED_ERP_PREAMBLE) { wiphy_dbg(hw->wiphy, " ERP_PREAMBLE: %d\n", info->use_short_preamble); } if (changed & BSS_CHANGED_ERP_SLOT) { wiphy_dbg(hw->wiphy, " ERP_SLOT: %d\n", info->use_short_slot); } if (changed & BSS_CHANGED_HT) { wiphy_dbg(hw->wiphy, " HT: op_mode=0x%x\n", info->ht_operation_mode); } if (changed & BSS_CHANGED_BASIC_RATES) { wiphy_dbg(hw->wiphy, " BASIC_RATES: 0x%llx\n", (unsigned long long) info->basic_rates); } if (changed & BSS_CHANGED_TXPOWER) wiphy_dbg(hw->wiphy, " TX Power: %d dBm\n", info->txpower); } static void mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; u32 bw = U32_MAX; int link_id; rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE(vif->link_conf); link_id++) { enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_bss_conf *vif_conf; struct ieee80211_link_sta *link_sta; link_sta = rcu_dereference(sta->link[link_id]); if (!link_sta) continue; switch (link_sta->bandwidth) { #define C(_bw) case IEEE80211_STA_RX_BW_##_bw: bw = _bw; break C(20); C(40); C(80); C(160); C(320); #undef C } if (!data->use_chanctx) { confbw = data->bw; } else { struct ieee80211_chanctx_conf *chanctx_conf; vif_conf = rcu_dereference(vif->link_conf[link_id]); if (WARN_ON(!vif_conf)) continue; chanctx_conf = rcu_dereference(vif_conf->chanctx_conf); if (!WARN_ON(!chanctx_conf)) confbw = chanctx_conf->def.width; } WARN(bw > hwsim_get_chanwidth(confbw), "intf %pM [link=%d]: bad STA %pM bandwidth %d MHz (%d) > channel config %d MHz (%d)\n", vif->addr, link_id, sta->addr, bw, sta->deflink.bandwidth, hwsim_get_chanwidth(data->bw), data->bw); } rcu_read_unlock(); } static int mac80211_hwsim_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; hwsim_check_magic(vif); hwsim_set_sta_magic(sta); mac80211_hwsim_sta_rc_update(hw, vif, sta, 0); if (sta->valid_links) { WARN(hweight16(sta->valid_links) > 1, "expect to add STA with single link, have 0x%x\n", sta->valid_links); sp->active_links_rx = sta->valid_links; } return 0; } static int mac80211_hwsim_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta) { hwsim_check_magic(vif); hwsim_clear_sta_magic(sta); return 0; } static int mac80211_hwsim_sta_state(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state) { if (new_state == IEEE80211_STA_NOTEXIST) return mac80211_hwsim_sta_remove(hw, vif, sta); if (old_state == IEEE80211_STA_NOTEXIST) return mac80211_hwsim_sta_add(hw, vif, sta); /* * when client is authorized (AP station marked as such), * enable all links */ if (vif->type == NL80211_IFTYPE_STATION && new_state == IEEE80211_STA_AUTHORIZED && !sta->tdls) ieee80211_set_active_links_async(vif, ieee80211_vif_usable_links(vif)); return 0; } static void mac80211_hwsim_sta_notify(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd cmd, struct ieee80211_sta *sta) { hwsim_check_magic(vif); switch (cmd) { case STA_NOTIFY_SLEEP: case STA_NOTIFY_AWAKE: /* TODO: make good use of these flags */ break; default: WARN(1, "Invalid sta notify: %d\n", cmd); break; } } static int mac80211_hwsim_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set) { hwsim_check_sta_magic(sta); return 0; } static int mac80211_hwsim_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, u16 queue, const struct ieee80211_tx_queue_params *params) { wiphy_dbg(hw->wiphy, "%s (queue=%d txop=%d cw_min=%d cw_max=%d aifs=%d)\n", __func__, queue, params->txop, params->cw_min, params->cw_max, params->aifs); return 0; } static int mac80211_hwsim_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) { struct mac80211_hwsim_data *hwsim = hw->priv; if (idx < 0 || idx >= ARRAY_SIZE(hwsim->survey_data)) return -ENOENT; mutex_lock(&hwsim->mutex); survey->channel = hwsim->survey_data[idx].channel; if (!survey->channel) { mutex_unlock(&hwsim->mutex); return -ENOENT; } /* * Magically conjured dummy values --- this is only ok for simulated hardware. * * A real driver which cannot determine real values noise MUST NOT * report any, especially not a magically conjured ones :-) */ survey->filled = SURVEY_INFO_NOISE_DBM | SURVEY_INFO_TIME | SURVEY_INFO_TIME_BUSY; survey->noise = -92; survey->time = jiffies_to_msecs(hwsim->survey_data[idx].end - hwsim->survey_data[idx].start); /* report 12.5% of channel time is used */ survey->time_busy = survey->time/8; mutex_unlock(&hwsim->mutex); return 0; } #ifdef CONFIG_NL80211_TESTMODE /* * This section contains example code for using netlink * attributes with the testmode command in nl80211. */ /* These enums need to be kept in sync with userspace */ enum hwsim_testmode_attr { __HWSIM_TM_ATTR_INVALID = 0, HWSIM_TM_ATTR_CMD = 1, HWSIM_TM_ATTR_PS = 2, /* keep last */ __HWSIM_TM_ATTR_AFTER_LAST, HWSIM_TM_ATTR_MAX = __HWSIM_TM_ATTR_AFTER_LAST - 1 }; enum hwsim_testmode_cmd { HWSIM_TM_CMD_SET_PS = 0, HWSIM_TM_CMD_GET_PS = 1, HWSIM_TM_CMD_STOP_QUEUES = 2, HWSIM_TM_CMD_WAKE_QUEUES = 3, }; static const struct nla_policy hwsim_testmode_policy[HWSIM_TM_ATTR_MAX + 1] = { [HWSIM_TM_ATTR_CMD] = { .type = NLA_U32 }, [HWSIM_TM_ATTR_PS] = { .type = NLA_U32 }, }; static int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len) { struct mac80211_hwsim_data *hwsim = hw->priv; struct nlattr *tb[HWSIM_TM_ATTR_MAX + 1]; struct sk_buff *skb; int err, ps; err = nla_parse_deprecated(tb, HWSIM_TM_ATTR_MAX, data, len, hwsim_testmode_policy, NULL); if (err) return err; if (!tb[HWSIM_TM_ATTR_CMD]) return -EINVAL; switch (nla_get_u32(tb[HWSIM_TM_ATTR_CMD])) { case HWSIM_TM_CMD_SET_PS: if (!tb[HWSIM_TM_ATTR_PS]) return -EINVAL; ps = nla_get_u32(tb[HWSIM_TM_ATTR_PS]); return hwsim_fops_ps_write(hwsim, ps); case HWSIM_TM_CMD_GET_PS: skb = cfg80211_testmode_alloc_reply_skb(hw->wiphy, nla_total_size(sizeof(u32))); if (!skb) return -ENOMEM; if (nla_put_u32(skb, HWSIM_TM_ATTR_PS, hwsim->ps)) goto nla_put_failure; return cfg80211_testmode_reply(skb); case HWSIM_TM_CMD_STOP_QUEUES: ieee80211_stop_queues(hw); return 0; case HWSIM_TM_CMD_WAKE_QUEUES: ieee80211_wake_queues(hw); return 0; default: return -EOPNOTSUPP; } nla_put_failure: kfree_skb(skb); return -ENOBUFS; } #endif static int mac80211_hwsim_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params) { struct ieee80211_sta *sta = params->sta; enum ieee80211_ampdu_mlme_action action = params->action; u16 tid = params->tid; switch (action) { case IEEE80211_AMPDU_TX_START: return IEEE80211_AMPDU_TX_START_IMMEDIATE; case IEEE80211_AMPDU_TX_STOP_CONT: case IEEE80211_AMPDU_TX_STOP_FLUSH: case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT: ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid); break; case IEEE80211_AMPDU_TX_OPERATIONAL: break; case IEEE80211_AMPDU_RX_START: case IEEE80211_AMPDU_RX_STOP: break; default: return -EOPNOTSUPP; } return 0; } static void mac80211_hwsim_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop) { /* Not implemented, queues only on kernel side */ } static void hw_scan_work(struct work_struct *work) { struct mac80211_hwsim_data *hwsim = container_of(work, struct mac80211_hwsim_data, hw_scan.work); struct cfg80211_scan_request *req = hwsim->hw_scan_request; int dwell, i; mutex_lock(&hwsim->mutex); if (hwsim->scan_chan_idx >= req->n_channels) { struct cfg80211_scan_info info = { .aborted = false, }; wiphy_dbg(hwsim->hw->wiphy, "hw scan complete\n"); ieee80211_scan_completed(hwsim->hw, &info); hwsim->hw_scan_request = NULL; hwsim->hw_scan_vif = NULL; hwsim->tmp_chan = NULL; mutex_unlock(&hwsim->mutex); mac80211_hwsim_config_mac_nl(hwsim->hw, hwsim->scan_addr, false); return; } wiphy_dbg(hwsim->hw->wiphy, "hw scan %d MHz\n", req->channels[hwsim->scan_chan_idx]->center_freq); hwsim->tmp_chan = req->channels[hwsim->scan_chan_idx]; if (hwsim->tmp_chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR) || !req->n_ssids) { dwell = 120; } else { dwell = 30; /* send probes */ for (i = 0; i < req->n_ssids; i++) { struct sk_buff *probe; struct ieee80211_mgmt *mgmt; probe = ieee80211_probereq_get(hwsim->hw, hwsim->scan_addr, req->ssids[i].ssid, req->ssids[i].ssid_len, req->ie_len); if (!probe) continue; mgmt = (struct ieee80211_mgmt *) probe->data; memcpy(mgmt->da, req->bssid, ETH_ALEN); memcpy(mgmt->bssid, req->bssid, ETH_ALEN); if (req->ie_len) skb_put_data(probe, req->ie, req->ie_len); rcu_read_lock(); if (!ieee80211_tx_prepare_skb(hwsim->hw, hwsim->hw_scan_vif, probe, hwsim->tmp_chan->band, NULL)) { rcu_read_unlock(); kfree_skb(probe); continue; } local_bh_disable(); mac80211_hwsim_tx_frame(hwsim->hw, probe, hwsim->tmp_chan); rcu_read_unlock(); local_bh_enable(); } } ieee80211_queue_delayed_work(hwsim->hw, &hwsim->hw_scan, msecs_to_jiffies(dwell)); hwsim->survey_data[hwsim->scan_chan_idx].channel = hwsim->tmp_chan; hwsim->survey_data[hwsim->scan_chan_idx].start = jiffies; hwsim->survey_data[hwsim->scan_chan_idx].end = jiffies + msecs_to_jiffies(dwell); hwsim->scan_chan_idx++; mutex_unlock(&hwsim->mutex); } static int mac80211_hwsim_hw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_scan_request *hw_req) { struct mac80211_hwsim_data *hwsim = hw->priv; struct cfg80211_scan_request *req = &hw_req->req; mutex_lock(&hwsim->mutex); if (WARN_ON(hwsim->tmp_chan || hwsim->hw_scan_request)) { mutex_unlock(&hwsim->mutex); return -EBUSY; } hwsim->hw_scan_request = req; hwsim->hw_scan_vif = vif; hwsim->scan_chan_idx = 0; if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(hwsim->scan_addr, hw_req->req.mac_addr, hw_req->req.mac_addr_mask); else memcpy(hwsim->scan_addr, vif->addr, ETH_ALEN); memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data)); mutex_unlock(&hwsim->mutex); mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true); wiphy_dbg(hw->wiphy, "hwsim hw_scan request\n"); ieee80211_queue_delayed_work(hwsim->hw, &hwsim->hw_scan, 0); return 0; } static void mac80211_hwsim_cancel_hw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; struct cfg80211_scan_info info = { .aborted = true, }; wiphy_dbg(hw->wiphy, "hwsim cancel_hw_scan\n"); cancel_delayed_work_sync(&hwsim->hw_scan); mutex_lock(&hwsim->mutex); ieee80211_scan_completed(hwsim->hw, &info); hwsim->tmp_chan = NULL; hwsim->hw_scan_request = NULL; hwsim->hw_scan_vif = NULL; mutex_unlock(&hwsim->mutex); } static void mac80211_hwsim_sw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const u8 *mac_addr) { struct mac80211_hwsim_data *hwsim = hw->priv; mutex_lock(&hwsim->mutex); if (hwsim->scanning) { pr_debug("two hwsim sw_scans detected!\n"); goto out; } pr_debug("hwsim sw_scan request, prepping stuff\n"); memcpy(hwsim->scan_addr, mac_addr, ETH_ALEN); mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true); hwsim->scanning = true; memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data)); out: mutex_unlock(&hwsim->mutex); } static void mac80211_hwsim_sw_scan_complete(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; mutex_lock(&hwsim->mutex); pr_debug("hwsim sw_scan_complete\n"); hwsim->scanning = false; mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, false); eth_zero_addr(hwsim->scan_addr); mutex_unlock(&hwsim->mutex); } static void hw_roc_start(struct work_struct *work) { struct mac80211_hwsim_data *hwsim = container_of(work, struct mac80211_hwsim_data, roc_start.work); mutex_lock(&hwsim->mutex); wiphy_dbg(hwsim->hw->wiphy, "hwsim ROC begins\n"); hwsim->tmp_chan = hwsim->roc_chan; ieee80211_ready_on_channel(hwsim->hw); ieee80211_queue_delayed_work(hwsim->hw, &hwsim->roc_done, msecs_to_jiffies(hwsim->roc_duration)); mutex_unlock(&hwsim->mutex); } static void hw_roc_done(struct work_struct *work) { struct mac80211_hwsim_data *hwsim = container_of(work, struct mac80211_hwsim_data, roc_done.work); mutex_lock(&hwsim->mutex); ieee80211_remain_on_channel_expired(hwsim->hw); hwsim->tmp_chan = NULL; mutex_unlock(&hwsim->mutex); wiphy_dbg(hwsim->hw->wiphy, "hwsim ROC expired\n"); } static int mac80211_hwsim_roc(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel *chan, int duration, enum ieee80211_roc_type type) { struct mac80211_hwsim_data *hwsim = hw->priv; mutex_lock(&hwsim->mutex); if (WARN_ON(hwsim->tmp_chan || hwsim->hw_scan_request)) { mutex_unlock(&hwsim->mutex); return -EBUSY; } hwsim->roc_chan = chan; hwsim->roc_duration = duration; mutex_unlock(&hwsim->mutex); wiphy_dbg(hw->wiphy, "hwsim ROC (%d MHz, %d ms)\n", chan->center_freq, duration); ieee80211_queue_delayed_work(hw, &hwsim->roc_start, HZ/50); return 0; } static int mac80211_hwsim_croc(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; cancel_delayed_work_sync(&hwsim->roc_start); cancel_delayed_work_sync(&hwsim->roc_done); mutex_lock(&hwsim->mutex); hwsim->tmp_chan = NULL; mutex_unlock(&hwsim->mutex); wiphy_dbg(hw->wiphy, "hwsim ROC canceled\n"); return 0; } static int mac80211_hwsim_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx) { hwsim_set_chanctx_magic(ctx); wiphy_dbg(hw->wiphy, "add channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n", ctx->def.chan->center_freq, ctx->def.width, ctx->def.center_freq1, ctx->def.center_freq2); return 0; } static void mac80211_hwsim_remove_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx) { wiphy_dbg(hw->wiphy, "remove channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n", ctx->def.chan->center_freq, ctx->def.width, ctx->def.center_freq1, ctx->def.center_freq2); hwsim_check_chanctx_magic(ctx); hwsim_clear_chanctx_magic(ctx); } static void mac80211_hwsim_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx, u32 changed) { hwsim_check_chanctx_magic(ctx); wiphy_dbg(hw->wiphy, "change channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n", ctx->def.chan->center_freq, ctx->def.width, ctx->def.center_freq1, ctx->def.center_freq2); } static int mac80211_hwsim_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx) { hwsim_check_magic(vif); hwsim_check_chanctx_magic(ctx); /* if we activate a link while already associated wake it up */ if (vif->type == NL80211_IFTYPE_STATION && vif->cfg.assoc) { struct sk_buff *skb; skb = ieee80211_nullfunc_get(hw, vif, link_conf->link_id, true); if (skb) { local_bh_disable(); mac80211_hwsim_tx_frame(hw, skb, ctx->def.chan); local_bh_enable(); } } return 0; } static void mac80211_hwsim_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx) { hwsim_check_magic(vif); hwsim_check_chanctx_magic(ctx); /* if we deactivate a link while associated suspend it first */ if (vif->type == NL80211_IFTYPE_STATION && vif->cfg.assoc) { struct sk_buff *skb; skb = ieee80211_nullfunc_get(hw, vif, link_conf->link_id, true); if (skb) { struct ieee80211_hdr *hdr = (void *)skb->data; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); local_bh_disable(); mac80211_hwsim_tx_frame(hw, skb, ctx->def.chan); local_bh_enable(); } } } static const char mac80211_hwsim_gstrings_stats[][ETH_GSTRING_LEN] = { "tx_pkts_nic", "tx_bytes_nic", "rx_pkts_nic", "rx_bytes_nic", "d_tx_dropped", "d_tx_failed", "d_ps_mode", "d_group", }; #define MAC80211_HWSIM_SSTATS_LEN ARRAY_SIZE(mac80211_hwsim_gstrings_stats) static void mac80211_hwsim_get_et_strings(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 sset, u8 *data) { if (sset == ETH_SS_STATS) memcpy(data, mac80211_hwsim_gstrings_stats, sizeof(mac80211_hwsim_gstrings_stats)); } static int mac80211_hwsim_get_et_sset_count(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int sset) { if (sset == ETH_SS_STATS) return MAC80211_HWSIM_SSTATS_LEN; return 0; } static void mac80211_hwsim_get_et_stats(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ethtool_stats *stats, u64 *data) { struct mac80211_hwsim_data *ar = hw->priv; int i = 0; data[i++] = ar->tx_pkts; data[i++] = ar->tx_bytes; data[i++] = ar->rx_pkts; data[i++] = ar->rx_bytes; data[i++] = ar->tx_dropped; data[i++] = ar->tx_failed; data[i++] = ar->ps; data[i++] = ar->group; WARN_ON(i != MAC80211_HWSIM_SSTATS_LEN); } static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw) { return 1; } static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, u32 value) { return -EOPNOTSUPP; } static int mac80211_hwsim_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]) { unsigned long rem = old_links & ~new_links; unsigned long add = new_links & ~old_links; int i; if (!old_links) rem |= BIT(0); if (!new_links) add |= BIT(0); for_each_set_bit(i, &rem, IEEE80211_MLD_MAX_NUM_LINKS) mac80211_hwsim_config_mac_nl(hw, old[i]->addr, false); for_each_set_bit(i, &add, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_bss_conf *link_conf; link_conf = link_conf_dereference_protected(vif, i); if (WARN_ON(!link_conf)) continue; mac80211_hwsim_config_mac_nl(hw, link_conf->addr, true); } return 0; } static int mac80211_hwsim_change_sta_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; hwsim_check_sta_magic(sta); if (vif->type == NL80211_IFTYPE_STATION) sp->active_links_rx = new_links; return 0; } static int mac80211_hwsim_send_pmsr_ftm_request_peer(struct sk_buff *msg, struct cfg80211_pmsr_ftm_request_peer *request) { struct nlattr *ftm; if (!request->requested) return -EINVAL; ftm = nla_nest_start(msg, NL80211_PMSR_TYPE_FTM); if (!ftm) return -ENOBUFS; if (nla_put_u32(msg, NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE, request->preamble)) return -ENOBUFS; if (nla_put_u16(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD, request->burst_period)) return -ENOBUFS; if (request->asap && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_ASAP)) return -ENOBUFS; if (request->request_lci && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI)) return -ENOBUFS; if (request->request_civicloc && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC)) return -ENOBUFS; if (request->trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED)) return -ENOBUFS; if (request->non_trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED)) return -ENOBUFS; if (request->lmr_feedback && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP, request->num_bursts_exp)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION, request->burst_duration)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST, request->ftms_per_burst)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES, request->ftmr_retries)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION, request->burst_duration)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR, request->bss_color)) return -ENOBUFS; nla_nest_end(msg, ftm); return 0; } static int mac80211_hwsim_send_pmsr_request_peer(struct sk_buff *msg, struct cfg80211_pmsr_request_peer *request) { struct nlattr *peer, *chandef, *req, *data; int err; peer = nla_nest_start(msg, NL80211_PMSR_ATTR_PEERS); if (!peer) return -ENOBUFS; if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, request->addr)) return -ENOBUFS; chandef = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_CHAN); if (!chandef) return -ENOBUFS; err = nl80211_send_chandef(msg, &request->chandef); if (err) return err; nla_nest_end(msg, chandef); req = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_REQ); if (!req) return -ENOBUFS; if (request->report_ap_tsf && nla_put_flag(msg, NL80211_PMSR_REQ_ATTR_GET_AP_TSF)) return -ENOBUFS; data = nla_nest_start(msg, NL80211_PMSR_REQ_ATTR_DATA); if (!data) return -ENOBUFS; err = mac80211_hwsim_send_pmsr_ftm_request_peer(msg, &request->ftm); if (err) return err; nla_nest_end(msg, data); nla_nest_end(msg, req); nla_nest_end(msg, peer); return 0; } static int mac80211_hwsim_send_pmsr_request(struct sk_buff *msg, struct cfg80211_pmsr_request *request) { struct nlattr *pmsr; int err; pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS); if (!pmsr) return -ENOBUFS; if (nla_put_u32(msg, NL80211_ATTR_TIMEOUT, request->timeout)) return -ENOBUFS; if (!is_zero_ether_addr(request->mac_addr)) { if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, request->mac_addr)) return -ENOBUFS; if (nla_put(msg, NL80211_ATTR_MAC_MASK, ETH_ALEN, request->mac_addr_mask)) return -ENOBUFS; } for (int i = 0; i < request->n_peers; i++) { err = mac80211_hwsim_send_pmsr_request_peer(msg, &request->peers[i]); if (err) return err; } nla_nest_end(msg, pmsr); return 0; } static int mac80211_hwsim_start_pmsr(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request) { struct mac80211_hwsim_data *data; struct sk_buff *skb = NULL; struct nlattr *pmsr; void *msg_head; u32 _portid; int err = 0; data = hw->priv; _portid = READ_ONCE(data->wmediumd); if (!_portid && !hwsim_virtio_enabled) return -EOPNOTSUPP; mutex_lock(&data->mutex); if (data->pmsr_request) { err = -EBUSY; goto out_free; } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) { err = -ENOMEM; goto out_free; } msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_START_PMSR); if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) { err = -ENOMEM; goto out_free; } pmsr = nla_nest_start(skb, HWSIM_ATTR_PMSR_REQUEST); if (!pmsr) { err = -ENOMEM; goto out_free; } err = mac80211_hwsim_send_pmsr_request(skb, request); if (err) goto out_free; nla_nest_end(skb, pmsr); genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) hwsim_tx_virtio(data, skb); else hwsim_unicast_netgroup(data, skb, _portid); data->pmsr_request = request; data->pmsr_request_wdev = ieee80211_vif_to_wdev(vif); out_free: if (err && skb) nlmsg_free(skb); mutex_unlock(&data->mutex); return err; } static void mac80211_hwsim_abort_pmsr(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request) { struct mac80211_hwsim_data *data; struct sk_buff *skb = NULL; struct nlattr *pmsr; void *msg_head; u32 _portid; int err = 0; data = hw->priv; _portid = READ_ONCE(data->wmediumd); if (!_portid && !hwsim_virtio_enabled) return; mutex_lock(&data->mutex); if (data->pmsr_request != request) { err = -EINVAL; goto out; } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) { err = -ENOMEM; goto out; } msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_ABORT_PMSR); if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) goto out; pmsr = nla_nest_start(skb, HWSIM_ATTR_PMSR_REQUEST); if (!pmsr) { err = -ENOMEM; goto out; } err = mac80211_hwsim_send_pmsr_request(skb, request); if (err) goto out; err = nla_nest_end(skb, pmsr); if (err) goto out; genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) hwsim_tx_virtio(data, skb); else hwsim_unicast_netgroup(data, skb, _portid); out: if (err && skb) nlmsg_free(skb); mutex_unlock(&data->mutex); } static int mac80211_hwsim_parse_rate_info(struct nlattr *rateattr, struct rate_info *rate_info, struct genl_info *info) { struct nlattr *tb[HWSIM_RATE_INFO_ATTR_MAX + 1]; int ret; ret = nla_parse_nested(tb, HWSIM_RATE_INFO_ATTR_MAX, rateattr, hwsim_rate_info_policy, info->extack); if (ret) return ret; if (tb[HWSIM_RATE_INFO_ATTR_FLAGS]) rate_info->flags = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_FLAGS]); if (tb[HWSIM_RATE_INFO_ATTR_MCS]) rate_info->mcs = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_MCS]); if (tb[HWSIM_RATE_INFO_ATTR_LEGACY]) rate_info->legacy = nla_get_u16(tb[HWSIM_RATE_INFO_ATTR_LEGACY]); if (tb[HWSIM_RATE_INFO_ATTR_NSS]) rate_info->nss = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_NSS]); if (tb[HWSIM_RATE_INFO_ATTR_BW]) rate_info->bw = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_BW]); if (tb[HWSIM_RATE_INFO_ATTR_HE_GI]) rate_info->he_gi = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_HE_GI]); if (tb[HWSIM_RATE_INFO_ATTR_HE_DCM]) rate_info->he_dcm = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_HE_DCM]); if (tb[HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC]) rate_info->he_ru_alloc = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC]); if (tb[HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH]) rate_info->n_bonded_ch = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH]); if (tb[HWSIM_RATE_INFO_ATTR_EHT_GI]) rate_info->eht_gi = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_EHT_GI]); if (tb[HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC]) rate_info->eht_ru_alloc = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC]); return 0; } static int mac80211_hwsim_parse_ftm_result(struct nlattr *ftm, struct cfg80211_pmsr_ftm_result *result, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_FTM_RESP_ATTR_MAX + 1]; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_FTM_RESP_ATTR_MAX, ftm, hwsim_ftm_result_policy, info->extack); if (ret) return ret; if (tb[NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON]) result->failure_reason = nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX]) result->burst_index = nla_get_u16(tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS]) { result->num_ftmr_attempts_valid = 1; result->num_ftmr_attempts = nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES]) { result->num_ftmr_successes_valid = 1; result->num_ftmr_successes = nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME]) result->busy_retry_time = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP]) result->num_bursts_exp = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION]) result->burst_duration = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST]) result->ftms_per_burst = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG]) { result->rssi_avg_valid = 1; result->rssi_avg = nla_get_s32(tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD]) { result->rssi_spread_valid = 1; result->rssi_spread = nla_get_s32(tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_TX_RATE]) { result->tx_rate_valid = 1; ret = mac80211_hwsim_parse_rate_info(tb[NL80211_PMSR_FTM_RESP_ATTR_TX_RATE], &result->tx_rate, info); if (ret) return ret; } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RX_RATE]) { result->rx_rate_valid = 1; ret = mac80211_hwsim_parse_rate_info(tb[NL80211_PMSR_FTM_RESP_ATTR_RX_RATE], &result->rx_rate, info); if (ret) return ret; } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG]) { result->rtt_avg_valid = 1; result->rtt_avg = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE]) { result->rtt_variance_valid = 1; result->rtt_variance = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD]) { result->rtt_spread_valid = 1; result->rtt_spread = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG]) { result->dist_avg_valid = 1; result->dist_avg = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE]) { result->dist_variance_valid = 1; result->dist_variance = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD]) { result->dist_spread_valid = 1; result->dist_spread = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]) { result->lci = nla_data(tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]); result->lci_len = nla_len(tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]) { result->civicloc = nla_data(tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]); result->civicloc_len = nla_len(tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]); } return 0; } static int mac80211_hwsim_parse_pmsr_resp(struct nlattr *resp, struct cfg80211_pmsr_result *result, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_RESP_ATTR_MAX + 1]; struct nlattr *pmsr; int rem; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_RESP_ATTR_MAX, resp, hwsim_pmsr_resp_policy, info->extack); if (ret) return ret; if (tb[NL80211_PMSR_RESP_ATTR_STATUS]) result->status = nla_get_u32(tb[NL80211_PMSR_RESP_ATTR_STATUS]); if (tb[NL80211_PMSR_RESP_ATTR_HOST_TIME]) result->host_time = nla_get_u64(tb[NL80211_PMSR_RESP_ATTR_HOST_TIME]); if (tb[NL80211_PMSR_RESP_ATTR_AP_TSF]) { result->ap_tsf_valid = 1; result->ap_tsf = nla_get_u64(tb[NL80211_PMSR_RESP_ATTR_AP_TSF]); } result->final = !!tb[NL80211_PMSR_RESP_ATTR_FINAL]; if (!tb[NL80211_PMSR_RESP_ATTR_DATA]) return 0; nla_for_each_nested(pmsr, tb[NL80211_PMSR_RESP_ATTR_DATA], rem) { switch (nla_type(pmsr)) { case NL80211_PMSR_TYPE_FTM: result->type = NL80211_PMSR_TYPE_FTM; ret = mac80211_hwsim_parse_ftm_result(pmsr, &result->ftm, info); if (ret) return ret; break; default: NL_SET_ERR_MSG_ATTR(info->extack, pmsr, "Unknown pmsr resp type"); return -EINVAL; } } return 0; } static int mac80211_hwsim_parse_pmsr_result(struct nlattr *peer, struct cfg80211_pmsr_result *result, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1]; int ret; if (!peer) return -EINVAL; ret = nla_parse_nested(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, hwsim_pmsr_peer_result_policy, info->extack); if (ret) return ret; if (tb[NL80211_PMSR_PEER_ATTR_ADDR]) memcpy(result->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN); if (tb[NL80211_PMSR_PEER_ATTR_RESP]) { ret = mac80211_hwsim_parse_pmsr_resp(tb[NL80211_PMSR_PEER_ATTR_RESP], result, info); if (ret) return ret; } return 0; }; static int hwsim_pmsr_report_nl(struct sk_buff *msg, struct genl_info *info) { struct mac80211_hwsim_data *data; struct nlattr *peers, *peer; struct nlattr *reqattr; const u8 *src; int err; int rem; if (!info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER]) return -EINVAL; src = nla_data(info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER]); data = get_hwsim_data_ref_from_addr(src); if (!data) return -EINVAL; mutex_lock(&data->mutex); if (!data->pmsr_request) { err = -EINVAL; goto out; } reqattr = info->attrs[HWSIM_ATTR_PMSR_RESULT]; if (!reqattr) { err = -EINVAL; goto out; } peers = nla_find_nested(reqattr, NL80211_PMSR_ATTR_PEERS); if (!peers) { err = -EINVAL; goto out; } nla_for_each_nested(peer, peers, rem) { struct cfg80211_pmsr_result result; err = mac80211_hwsim_parse_pmsr_result(peer, &result, info); if (err) goto out; cfg80211_pmsr_report(data->pmsr_request_wdev, data->pmsr_request, &result, GFP_KERNEL); } cfg80211_pmsr_complete(data->pmsr_request_wdev, data->pmsr_request, GFP_KERNEL); err = 0; out: data->pmsr_request = NULL; data->pmsr_request_wdev = NULL; mutex_unlock(&data->mutex); return err; } #define HWSIM_COMMON_OPS \ .tx = mac80211_hwsim_tx, \ .wake_tx_queue = ieee80211_handle_wake_tx_queue, \ .start = mac80211_hwsim_start, \ .stop = mac80211_hwsim_stop, \ .add_interface = mac80211_hwsim_add_interface, \ .change_interface = mac80211_hwsim_change_interface, \ .remove_interface = mac80211_hwsim_remove_interface, \ .config = mac80211_hwsim_config, \ .configure_filter = mac80211_hwsim_configure_filter, \ .vif_cfg_changed = mac80211_hwsim_vif_info_changed, \ .link_info_changed = mac80211_hwsim_link_info_changed, \ .tx_last_beacon = mac80211_hwsim_tx_last_beacon, \ .sta_notify = mac80211_hwsim_sta_notify, \ .sta_rc_update = mac80211_hwsim_sta_rc_update, \ .conf_tx = mac80211_hwsim_conf_tx, \ .get_survey = mac80211_hwsim_get_survey, \ CFG80211_TESTMODE_CMD(mac80211_hwsim_testmode_cmd) \ .ampdu_action = mac80211_hwsim_ampdu_action, \ .flush = mac80211_hwsim_flush, \ .get_et_sset_count = mac80211_hwsim_get_et_sset_count, \ .get_et_stats = mac80211_hwsim_get_et_stats, \ .get_et_strings = mac80211_hwsim_get_et_strings, \ .start_pmsr = mac80211_hwsim_start_pmsr, \ .abort_pmsr = mac80211_hwsim_abort_pmsr, #define HWSIM_NON_MLO_OPS \ .sta_add = mac80211_hwsim_sta_add, \ .sta_remove = mac80211_hwsim_sta_remove, \ .set_tim = mac80211_hwsim_set_tim, \ .get_tsf = mac80211_hwsim_get_tsf, \ .set_tsf = mac80211_hwsim_set_tsf, static const struct ieee80211_ops mac80211_hwsim_ops = { HWSIM_COMMON_OPS HWSIM_NON_MLO_OPS .sw_scan_start = mac80211_hwsim_sw_scan, .sw_scan_complete = mac80211_hwsim_sw_scan_complete, }; #define HWSIM_CHANCTX_OPS \ .hw_scan = mac80211_hwsim_hw_scan, \ .cancel_hw_scan = mac80211_hwsim_cancel_hw_scan, \ .remain_on_channel = mac80211_hwsim_roc, \ .cancel_remain_on_channel = mac80211_hwsim_croc, \ .add_chanctx = mac80211_hwsim_add_chanctx, \ .remove_chanctx = mac80211_hwsim_remove_chanctx, \ .change_chanctx = mac80211_hwsim_change_chanctx, \ .assign_vif_chanctx = mac80211_hwsim_assign_vif_chanctx,\ .unassign_vif_chanctx = mac80211_hwsim_unassign_vif_chanctx, static const struct ieee80211_ops mac80211_hwsim_mchan_ops = { HWSIM_COMMON_OPS HWSIM_NON_MLO_OPS HWSIM_CHANCTX_OPS }; static const struct ieee80211_ops mac80211_hwsim_mlo_ops = { HWSIM_COMMON_OPS HWSIM_CHANCTX_OPS .set_rts_threshold = mac80211_hwsim_set_rts_threshold, .change_vif_links = mac80211_hwsim_change_vif_links, .change_sta_links = mac80211_hwsim_change_sta_links, .sta_state = mac80211_hwsim_sta_state, }; struct hwsim_new_radio_params { unsigned int channels; const char *reg_alpha2; const struct ieee80211_regdomain *regd; bool reg_strict; bool p2p_device; bool use_chanctx; bool destroy_on_close; const char *hwname; bool no_vif; const u8 *perm_addr; u32 iftypes; u32 *ciphers; u8 n_ciphers; bool mlo; const struct cfg80211_pmsr_capabilities *pmsr_capa; }; static void hwsim_mcast_config_msg(struct sk_buff *mcast_skb, struct genl_info *info) { if (info) genl_notify(&hwsim_genl_family, mcast_skb, info, HWSIM_MCGRP_CONFIG, GFP_KERNEL); else genlmsg_multicast(&hwsim_genl_family, mcast_skb, 0, HWSIM_MCGRP_CONFIG, GFP_KERNEL); } static int append_radio_msg(struct sk_buff *skb, int id, struct hwsim_new_radio_params *param) { int ret; ret = nla_put_u32(skb, HWSIM_ATTR_RADIO_ID, id); if (ret < 0) return ret; if (param->channels) { ret = nla_put_u32(skb, HWSIM_ATTR_CHANNELS, param->channels); if (ret < 0) return ret; } if (param->reg_alpha2) { ret = nla_put(skb, HWSIM_ATTR_REG_HINT_ALPHA2, 2, param->reg_alpha2); if (ret < 0) return ret; } if (param->regd) { int i; for (i = 0; i < ARRAY_SIZE(hwsim_world_regdom_custom); i++) { if (hwsim_world_regdom_custom[i] != param->regd) continue; ret = nla_put_u32(skb, HWSIM_ATTR_REG_CUSTOM_REG, i); if (ret < 0) return ret; break; } } if (param->reg_strict) { ret = nla_put_flag(skb, HWSIM_ATTR_REG_STRICT_REG); if (ret < 0) return ret; } if (param->p2p_device) { ret = nla_put_flag(skb, HWSIM_ATTR_SUPPORT_P2P_DEVICE); if (ret < 0) return ret; } if (param->use_chanctx) { ret = nla_put_flag(skb, HWSIM_ATTR_USE_CHANCTX); if (ret < 0) return ret; } if (param->hwname) { ret = nla_put(skb, HWSIM_ATTR_RADIO_NAME, strlen(param->hwname), param->hwname); if (ret < 0) return ret; } return 0; } static void hwsim_mcast_new_radio(int id, struct genl_info *info, struct hwsim_new_radio_params *param) { struct sk_buff *mcast_skb; void *data; mcast_skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!mcast_skb) return; data = genlmsg_put(mcast_skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_NEW_RADIO); if (!data) goto out_err; if (append_radio_msg(mcast_skb, id, param) < 0) goto out_err; genlmsg_end(mcast_skb, data); hwsim_mcast_config_msg(mcast_skb, info); return; out_err: nlmsg_free(mcast_skb); } static const struct ieee80211_sband_iftype_data sband_capa_2ghz[] = { { .types_mask = BIT(NL80211_IFTYPE_STATION), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xffff), .tx_mcs_160 = cpu_to_le16(0xffff), .rx_mcs_80p80 = cpu_to_le16(0xffff), .tx_mcs_80p80 = cpu_to_le16(0xffff), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * Since B0, B1, B2 and B3 are not set in * the supported channel width set field in the * HE PHY capabilities information field the * device is a 20MHz only device on 2.4GHz band. */ .only_20mhz = { .rx_tx_mcs7_max_nss = 0x88, .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, { .types_mask = BIT(NL80211_IFTYPE_AP), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xffff), .tx_mcs_160 = cpu_to_le16(0xffff), .rx_mcs_80p80 = cpu_to_le16(0xffff), .tx_mcs_80p80 = cpu_to_le16(0xffff), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * Since B0, B1, B2 and B3 are not set in * the supported channel width set field in the * HE PHY capabilities information field the * device is a 20MHz only device on 2.4GHz band. */ .only_20mhz = { .rx_tx_mcs7_max_nss = 0x88, .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, #ifdef CONFIG_MAC80211_MESH { .types_mask = BIT(NL80211_IFTYPE_MESH_POINT), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = 0, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xffff), .tx_mcs_160 = cpu_to_le16(0xffff), .rx_mcs_80p80 = cpu_to_le16(0xffff), .tx_mcs_80p80 = cpu_to_le16(0xffff), }, }, }, #endif }; static const struct ieee80211_sband_iftype_data sband_capa_5ghz[] = { { /* TODO: should we support other types, e.g., P2P? */ .types_mask = BIT(NL80211_IFTYPE_STATION), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field include all * the following MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, { .types_mask = BIT(NL80211_IFTYPE_AP), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field include all * the following MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, #ifdef CONFIG_MAC80211_MESH { /* TODO: should we support other types, e.g., IBSS?*/ .types_mask = BIT(NL80211_IFTYPE_MESH_POINT), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = 0, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, }, #endif }; static const struct ieee80211_sband_iftype_data sband_capa_6ghz[] = { { /* TODO: should we support other types, e.g., P2P? */ .types_mask = BIT(NL80211_IFTYPE_STATION), .he_6ghz_capa = { .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START | IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP | IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN | IEEE80211_HE_6GHZ_CAP_SM_PS | IEEE80211_HE_6GHZ_CAP_RD_RESPONDER | IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS | IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS), }, .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ | IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK | IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field and 320MHz in * 6GHz is supported include all the following * MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._320 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, { .types_mask = BIT(NL80211_IFTYPE_AP), .he_6ghz_capa = { .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START | IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP | IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN | IEEE80211_HE_6GHZ_CAP_SM_PS | IEEE80211_HE_6GHZ_CAP_RD_RESPONDER | IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS | IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS), }, .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ | IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK | IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field and 320MHz in * 6GHz is supported include all the following * MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._320 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, #ifdef CONFIG_MAC80211_MESH { /* TODO: should we support other types, e.g., IBSS?*/ .types_mask = BIT(NL80211_IFTYPE_MESH_POINT), .he_6ghz_capa = { .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START | IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP | IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN | IEEE80211_HE_6GHZ_CAP_SM_PS | IEEE80211_HE_6GHZ_CAP_RD_RESPONDER | IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS | IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS), }, .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = 0, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, }, #endif }; static void mac80211_hwsim_sband_capab(struct ieee80211_supported_band *sband) { switch (sband->band) { case NL80211_BAND_2GHZ: ieee80211_set_sband_iftype_data(sband, sband_capa_2ghz); break; case NL80211_BAND_5GHZ: ieee80211_set_sband_iftype_data(sband, sband_capa_5ghz); break; case NL80211_BAND_6GHZ: ieee80211_set_sband_iftype_data(sband, sband_capa_6ghz); break; default: break; } } #ifdef CONFIG_MAC80211_MESH #define HWSIM_MESH_BIT BIT(NL80211_IFTYPE_MESH_POINT) #else #define HWSIM_MESH_BIT 0 #endif #define HWSIM_DEFAULT_IF_LIMIT \ (BIT(NL80211_IFTYPE_STATION) | \ BIT(NL80211_IFTYPE_P2P_CLIENT) | \ BIT(NL80211_IFTYPE_AP) | \ BIT(NL80211_IFTYPE_P2P_GO) | \ HWSIM_MESH_BIT) #define HWSIM_IFTYPE_SUPPORT_MASK \ (BIT(NL80211_IFTYPE_STATION) | \ BIT(NL80211_IFTYPE_AP) | \ BIT(NL80211_IFTYPE_P2P_CLIENT) | \ BIT(NL80211_IFTYPE_P2P_GO) | \ BIT(NL80211_IFTYPE_ADHOC) | \ BIT(NL80211_IFTYPE_MESH_POINT) | \ BIT(NL80211_IFTYPE_OCB)) static int mac80211_hwsim_new_radio(struct genl_info *info, struct hwsim_new_radio_params *param) { int err; u8 addr[ETH_ALEN]; struct mac80211_hwsim_data *data; struct ieee80211_hw *hw; enum nl80211_band band; const struct ieee80211_ops *ops = &mac80211_hwsim_ops; struct net *net; int idx, i; int n_limits = 0; if (WARN_ON(param->channels > 1 && !param->use_chanctx)) return -EINVAL; spin_lock_bh(&hwsim_radio_lock); idx = hwsim_radio_idx++; spin_unlock_bh(&hwsim_radio_lock); if (param->mlo) ops = &mac80211_hwsim_mlo_ops; else if (param->use_chanctx) ops = &mac80211_hwsim_mchan_ops; hw = ieee80211_alloc_hw_nm(sizeof(*data), ops, param->hwname); if (!hw) { pr_debug("mac80211_hwsim: ieee80211_alloc_hw failed\n"); err = -ENOMEM; goto failed; } /* ieee80211_alloc_hw_nm may have used a default name */ param->hwname = wiphy_name(hw->wiphy); if (info) net = genl_info_net(info); else net = &init_net; wiphy_net_set(hw->wiphy, net); data = hw->priv; data->hw = hw; data->dev = device_create(hwsim_class, NULL, 0, hw, "hwsim%d", idx); if (IS_ERR(data->dev)) { printk(KERN_DEBUG "mac80211_hwsim: device_create failed (%ld)\n", PTR_ERR(data->dev)); err = -ENOMEM; goto failed_drvdata; } data->dev->driver = &mac80211_hwsim_driver.driver; err = device_bind_driver(data->dev); if (err != 0) { pr_debug("mac80211_hwsim: device_bind_driver failed (%d)\n", err); goto failed_bind; } skb_queue_head_init(&data->pending); SET_IEEE80211_DEV(hw, data->dev); if (!param->perm_addr) { eth_zero_addr(addr); addr[0] = 0x02; addr[3] = idx >> 8; addr[4] = idx; memcpy(data->addresses[0].addr, addr, ETH_ALEN); /* Why need here second address ? */ memcpy(data->addresses[1].addr, addr, ETH_ALEN); data->addresses[1].addr[0] |= 0x40; hw->wiphy->n_addresses = 2; hw->wiphy->addresses = data->addresses; /* possible address clash is checked at hash table insertion */ } else { memcpy(data->addresses[0].addr, param->perm_addr, ETH_ALEN); /* compatibility with automatically generated mac addr */ memcpy(data->addresses[1].addr, param->perm_addr, ETH_ALEN); hw->wiphy->n_addresses = 2; hw->wiphy->addresses = data->addresses; } data->channels = param->channels; data->use_chanctx = param->use_chanctx; data->idx = idx; data->destroy_on_close = param->destroy_on_close; if (info) data->portid = info->snd_portid; /* setup interface limits, only on interface types we support */ if (param->iftypes & BIT(NL80211_IFTYPE_ADHOC)) { data->if_limits[n_limits].max = 1; data->if_limits[n_limits].types = BIT(NL80211_IFTYPE_ADHOC); n_limits++; } if (param->iftypes & HWSIM_DEFAULT_IF_LIMIT) { data->if_limits[n_limits].max = 2048; /* * For this case, we may only support a subset of * HWSIM_DEFAULT_IF_LIMIT, therefore we only want to add the * bits that both param->iftype & HWSIM_DEFAULT_IF_LIMIT have. */ data->if_limits[n_limits].types = HWSIM_DEFAULT_IF_LIMIT & param->iftypes; n_limits++; } if (param->iftypes & BIT(NL80211_IFTYPE_P2P_DEVICE)) { data->if_limits[n_limits].max = 1; data->if_limits[n_limits].types = BIT(NL80211_IFTYPE_P2P_DEVICE); n_limits++; } if (data->use_chanctx) { hw->wiphy->max_scan_ssids = 255; hw->wiphy->max_scan_ie_len = IEEE80211_MAX_DATA_LEN; hw->wiphy->max_remain_on_channel_duration = 1000; data->if_combination.radar_detect_widths = 0; data->if_combination.num_different_channels = data->channels; } else { data->if_combination.num_different_channels = 1; data->if_combination.radar_detect_widths = BIT(NL80211_CHAN_WIDTH_5) | BIT(NL80211_CHAN_WIDTH_10) | BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80) | BIT(NL80211_CHAN_WIDTH_160); } if (!n_limits) { err = -EINVAL; goto failed_hw; } data->if_combination.max_interfaces = 0; for (i = 0; i < n_limits; i++) data->if_combination.max_interfaces += data->if_limits[i].max; data->if_combination.n_limits = n_limits; data->if_combination.limits = data->if_limits; /* * If we actually were asked to support combinations, * advertise them - if there's only a single thing like * only IBSS then don't advertise it as combinations. */ if (data->if_combination.max_interfaces > 1) { hw->wiphy->iface_combinations = &data->if_combination; hw->wiphy->n_iface_combinations = 1; } if (param->ciphers) { memcpy(data->ciphers, param->ciphers, param->n_ciphers * sizeof(u32)); hw->wiphy->cipher_suites = data->ciphers; hw->wiphy->n_cipher_suites = param->n_ciphers; } hw->wiphy->mbssid_max_interfaces = 8; hw->wiphy->ema_max_profile_periodicity = 3; data->rx_rssi = DEFAULT_RX_RSSI; INIT_DELAYED_WORK(&data->roc_start, hw_roc_start); INIT_DELAYED_WORK(&data->roc_done, hw_roc_done); INIT_DELAYED_WORK(&data->hw_scan, hw_scan_work); hw->queues = 5; hw->offchannel_tx_hw_queue = 4; ieee80211_hw_set(hw, SUPPORT_FAST_XMIT); ieee80211_hw_set(hw, CHANCTX_STA_CSA); ieee80211_hw_set(hw, SUPPORTS_HT_CCK_RATES); ieee80211_hw_set(hw, QUEUE_CONTROL); ieee80211_hw_set(hw, WANT_MONITOR_VIF); ieee80211_hw_set(hw, AMPDU_AGGREGATION); ieee80211_hw_set(hw, MFP_CAPABLE); ieee80211_hw_set(hw, SIGNAL_DBM); ieee80211_hw_set(hw, SUPPORTS_PS); ieee80211_hw_set(hw, REPORTS_TX_ACK_STATUS); ieee80211_hw_set(hw, TDLS_WIDER_BW); ieee80211_hw_set(hw, SUPPORTS_MULTI_BSSID); if (param->mlo) { hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_MLO; ieee80211_hw_set(hw, HAS_RATE_CONTROL); ieee80211_hw_set(hw, SUPPORTS_DYNAMIC_PS); ieee80211_hw_set(hw, CONNECTION_MONITOR); ieee80211_hw_set(hw, AP_LINK_PS); } else { ieee80211_hw_set(hw, HOST_BROADCAST_PS_BUFFERING); ieee80211_hw_set(hw, PS_NULLFUNC_STACK); if (rctbl) ieee80211_hw_set(hw, SUPPORTS_RC_TABLE); } hw->wiphy->flags &= ~WIPHY_FLAG_PS_ON_BY_DEFAULT; hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_TDLS | WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | WIPHY_FLAG_AP_UAPSD | WIPHY_FLAG_SUPPORTS_5_10_MHZ | WIPHY_FLAG_HAS_CHANNEL_SWITCH; hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR | NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE | NL80211_FEATURE_STATIC_SMPS | NL80211_FEATURE_DYNAMIC_SMPS | NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR; wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_VHT_IBSS); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT); hw->wiphy->interface_modes = param->iftypes; /* ask mac80211 to reserve space for magic */ hw->vif_data_size = sizeof(struct hwsim_vif_priv); hw->sta_data_size = sizeof(struct hwsim_sta_priv); hw->chanctx_data_size = sizeof(struct hwsim_chanctx_priv); memcpy(data->channels_2ghz, hwsim_channels_2ghz, sizeof(hwsim_channels_2ghz)); memcpy(data->channels_5ghz, hwsim_channels_5ghz, sizeof(hwsim_channels_5ghz)); memcpy(data->channels_6ghz, hwsim_channels_6ghz, sizeof(hwsim_channels_6ghz)); memcpy(data->channels_s1g, hwsim_channels_s1g, sizeof(hwsim_channels_s1g)); memcpy(data->rates, hwsim_rates, sizeof(hwsim_rates)); for (band = NL80211_BAND_2GHZ; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband = &data->bands[band]; sband->band = band; switch (band) { case NL80211_BAND_2GHZ: sband->channels = data->channels_2ghz; sband->n_channels = ARRAY_SIZE(hwsim_channels_2ghz); sband->bitrates = data->rates; sband->n_bitrates = ARRAY_SIZE(hwsim_rates); break; case NL80211_BAND_5GHZ: sband->channels = data->channels_5ghz; sband->n_channels = ARRAY_SIZE(hwsim_channels_5ghz); sband->bitrates = data->rates + 4; sband->n_bitrates = ARRAY_SIZE(hwsim_rates) - 4; sband->vht_cap.vht_supported = true; sband->vht_cap.cap = IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ | IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | IEEE80211_VHT_CAP_TXSTBC | IEEE80211_VHT_CAP_RXSTBC_4 | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; sband->vht_cap.vht_mcs.rx_mcs_map = cpu_to_le16(IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 14); sband->vht_cap.vht_mcs.tx_mcs_map = sband->vht_cap.vht_mcs.rx_mcs_map; break; case NL80211_BAND_6GHZ: sband->channels = data->channels_6ghz; sband->n_channels = ARRAY_SIZE(hwsim_channels_6ghz); sband->bitrates = data->rates + 4; sband->n_bitrates = ARRAY_SIZE(hwsim_rates) - 4; break; case NL80211_BAND_S1GHZ: memcpy(&sband->s1g_cap, &hwsim_s1g_cap, sizeof(sband->s1g_cap)); sband->channels = data->channels_s1g; sband->n_channels = ARRAY_SIZE(hwsim_channels_s1g); break; default: continue; } if (band != NL80211_BAND_6GHZ){ sband->ht_cap.ht_supported = true; sband->ht_cap.cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40; sband->ht_cap.ampdu_factor = 0x3; sband->ht_cap.ampdu_density = 0x6; memset(&sband->ht_cap.mcs, 0, sizeof(sband->ht_cap.mcs)); sband->ht_cap.mcs.rx_mask[0] = 0xff; sband->ht_cap.mcs.rx_mask[1] = 0xff; sband->ht_cap.mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED; } mac80211_hwsim_sband_capab(sband); hw->wiphy->bands[band] = sband; } /* By default all radios belong to the first group */ data->group = 1; mutex_init(&data->mutex); data->netgroup = hwsim_net_get_netgroup(net); data->wmediumd = hwsim_net_get_wmediumd(net); /* Enable frame retransmissions for lossy channels */ hw->max_rates = 4; hw->max_rate_tries = 11; hw->wiphy->vendor_commands = mac80211_hwsim_vendor_commands; hw->wiphy->n_vendor_commands = ARRAY_SIZE(mac80211_hwsim_vendor_commands); hw->wiphy->vendor_events = mac80211_hwsim_vendor_events; hw->wiphy->n_vendor_events = ARRAY_SIZE(mac80211_hwsim_vendor_events); if (param->reg_strict) hw->wiphy->regulatory_flags |= REGULATORY_STRICT_REG; if (param->regd) { data->regd = param->regd; hw->wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG; wiphy_apply_custom_regulatory(hw->wiphy, param->regd); /* give the regulatory workqueue a chance to run */ schedule_timeout_interruptible(1); } /* TODO: Add param */ wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_DFS_CONCURRENT); if (param->no_vif) ieee80211_hw_set(hw, NO_AUTO_VIF); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); for (i = 0; i < ARRAY_SIZE(data->link_data); i++) { hrtimer_init(&data->link_data[i].beacon_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_SOFT); data->link_data[i].beacon_timer.function = mac80211_hwsim_beacon; data->link_data[i].link_id = i; } err = ieee80211_register_hw(hw); if (err < 0) { pr_debug("mac80211_hwsim: ieee80211_register_hw failed (%d)\n", err); goto failed_hw; } wiphy_dbg(hw->wiphy, "hwaddr %pM registered\n", hw->wiphy->perm_addr); if (param->reg_alpha2) { data->alpha2[0] = param->reg_alpha2[0]; data->alpha2[1] = param->reg_alpha2[1]; regulatory_hint(hw->wiphy, param->reg_alpha2); } data->debugfs = debugfs_create_dir("hwsim", hw->wiphy->debugfsdir); debugfs_create_file("ps", 0666, data->debugfs, data, &hwsim_fops_ps); debugfs_create_file("group", 0666, data->debugfs, data, &hwsim_fops_group); debugfs_create_file("rx_rssi", 0666, data->debugfs, data, &hwsim_fops_rx_rssi); if (!data->use_chanctx) debugfs_create_file("dfs_simulate_radar", 0222, data->debugfs, data, &hwsim_simulate_radar); if (param->pmsr_capa) { data->pmsr_capa = *param->pmsr_capa; hw->wiphy->pmsr_capa = &data->pmsr_capa; } spin_lock_bh(&hwsim_radio_lock); err = rhashtable_insert_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); if (err < 0) { if (info) { GENL_SET_ERR_MSG(info, "perm addr already present"); NL_SET_BAD_ATTR(info->extack, info->attrs[HWSIM_ATTR_PERM_ADDR]); } spin_unlock_bh(&hwsim_radio_lock); goto failed_final_insert; } list_add_tail(&data->list, &hwsim_radios); hwsim_radios_generation++; spin_unlock_bh(&hwsim_radio_lock); hwsim_mcast_new_radio(idx, info, param); return idx; failed_final_insert: debugfs_remove_recursive(data->debugfs); ieee80211_unregister_hw(data->hw); failed_hw: device_release_driver(data->dev); failed_bind: device_unregister(data->dev); failed_drvdata: ieee80211_free_hw(hw); failed: return err; } static void hwsim_mcast_del_radio(int id, const char *hwname, struct genl_info *info) { struct sk_buff *skb; void *data; int ret; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; data = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_DEL_RADIO); if (!data) goto error; ret = nla_put_u32(skb, HWSIM_ATTR_RADIO_ID, id); if (ret < 0) goto error; ret = nla_put(skb, HWSIM_ATTR_RADIO_NAME, strlen(hwname), hwname); if (ret < 0) goto error; genlmsg_end(skb, data); hwsim_mcast_config_msg(skb, info); return; error: nlmsg_free(skb); } static void mac80211_hwsim_del_radio(struct mac80211_hwsim_data *data, const char *hwname, struct genl_info *info) { hwsim_mcast_del_radio(data->idx, hwname, info); debugfs_remove_recursive(data->debugfs); ieee80211_unregister_hw(data->hw); device_release_driver(data->dev); device_unregister(data->dev); ieee80211_free_hw(data->hw); } static int mac80211_hwsim_get_radio(struct sk_buff *skb, struct mac80211_hwsim_data *data, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct hwsim_new_radio_params param = { }; int res = -EMSGSIZE; hdr = genlmsg_put(skb, portid, seq, &hwsim_genl_family, flags, HWSIM_CMD_GET_RADIO); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (data->alpha2[0] && data->alpha2[1]) param.reg_alpha2 = data->alpha2; param.reg_strict = !!(data->hw->wiphy->regulatory_flags & REGULATORY_STRICT_REG); param.p2p_device = !!(data->hw->wiphy->interface_modes & BIT(NL80211_IFTYPE_P2P_DEVICE)); param.use_chanctx = data->use_chanctx; param.regd = data->regd; param.channels = data->channels; param.hwname = wiphy_name(data->hw->wiphy); param.pmsr_capa = &data->pmsr_capa; res = append_radio_msg(skb, data->idx, &param); if (res < 0) goto out_err; genlmsg_end(skb, hdr); return 0; out_err: genlmsg_cancel(skb, hdr); return res; } static void mac80211_hwsim_free(void) { struct mac80211_hwsim_data *data; spin_lock_bh(&hwsim_radio_lock); while ((data = list_first_entry_or_null(&hwsim_radios, struct mac80211_hwsim_data, list))) { list_del(&data->list); spin_unlock_bh(&hwsim_radio_lock); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), NULL); spin_lock_bh(&hwsim_radio_lock); } spin_unlock_bh(&hwsim_radio_lock); class_destroy(hwsim_class); } static const struct net_device_ops hwsim_netdev_ops = { .ndo_start_xmit = hwsim_mon_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; static void hwsim_mon_setup(struct net_device *dev) { u8 addr[ETH_ALEN]; dev->netdev_ops = &hwsim_netdev_ops; dev->needs_free_netdev = true; ether_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->type = ARPHRD_IEEE80211_RADIOTAP; eth_zero_addr(addr); addr[0] = 0x12; eth_hw_addr_set(dev, addr); } static void hwsim_register_wmediumd(struct net *net, u32 portid) { struct mac80211_hwsim_data *data; hwsim_net_set_wmediumd(net, portid); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) { if (data->netgroup == hwsim_net_get_netgroup(net)) data->wmediumd = portid; } spin_unlock_bh(&hwsim_radio_lock); } static int hwsim_tx_info_frame_received_nl(struct sk_buff *skb_2, struct genl_info *info) { struct ieee80211_hdr *hdr; struct mac80211_hwsim_data *data2; struct ieee80211_tx_info *txi; struct hwsim_tx_rate *tx_attempts; u64 ret_skb_cookie; struct sk_buff *skb, *tmp; const u8 *src; unsigned int hwsim_flags; int i; unsigned long flags; bool found = false; if (!info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER] || !info->attrs[HWSIM_ATTR_FLAGS] || !info->attrs[HWSIM_ATTR_COOKIE] || !info->attrs[HWSIM_ATTR_SIGNAL] || !info->attrs[HWSIM_ATTR_TX_INFO]) goto out; src = (void *)nla_data(info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER]); hwsim_flags = nla_get_u32(info->attrs[HWSIM_ATTR_FLAGS]); ret_skb_cookie = nla_get_u64(info->attrs[HWSIM_ATTR_COOKIE]); data2 = get_hwsim_data_ref_from_addr(src); if (!data2) goto out; if (!hwsim_virtio_enabled) { if (hwsim_net_get_netgroup(genl_info_net(info)) != data2->netgroup) goto out; if (info->snd_portid != data2->wmediumd) goto out; } /* look for the skb matching the cookie passed back from user */ spin_lock_irqsave(&data2->pending.lock, flags); skb_queue_walk_safe(&data2->pending, skb, tmp) { uintptr_t skb_cookie; txi = IEEE80211_SKB_CB(skb); skb_cookie = (uintptr_t)txi->rate_driver_data[0]; if (skb_cookie == ret_skb_cookie) { __skb_unlink(skb, &data2->pending); found = true; break; } } spin_unlock_irqrestore(&data2->pending.lock, flags); /* not found */ if (!found) goto out; /* Tx info received because the frame was broadcasted on user space, so we get all the necessary info: tx attempts and skb control buff */ tx_attempts = (struct hwsim_tx_rate *)nla_data( info->attrs[HWSIM_ATTR_TX_INFO]); /* now send back TX status */ txi = IEEE80211_SKB_CB(skb); ieee80211_tx_info_clear_status(txi); for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { txi->status.rates[i].idx = tx_attempts[i].idx; txi->status.rates[i].count = tx_attempts[i].count; } txi->status.ack_signal = nla_get_u32(info->attrs[HWSIM_ATTR_SIGNAL]); if (!(hwsim_flags & HWSIM_TX_CTL_NO_ACK) && (hwsim_flags & HWSIM_TX_STAT_ACK)) { if (skb->len >= 16) { hdr = (struct ieee80211_hdr *) skb->data; mac80211_hwsim_monitor_ack(data2->channel, hdr->addr2); } txi->flags |= IEEE80211_TX_STAT_ACK; } if (hwsim_flags & HWSIM_TX_CTL_NO_ACK) txi->flags |= IEEE80211_TX_STAT_NOACK_TRANSMITTED; ieee80211_tx_status_irqsafe(data2->hw, skb); return 0; out: return -EINVAL; } static int hwsim_cloned_frame_received_nl(struct sk_buff *skb_2, struct genl_info *info) { struct mac80211_hwsim_data *data2; struct ieee80211_rx_status rx_status; struct ieee80211_hdr *hdr; const u8 *dst; int frame_data_len; void *frame_data; struct sk_buff *skb = NULL; struct ieee80211_channel *channel = NULL; if (!info->attrs[HWSIM_ATTR_ADDR_RECEIVER] || !info->attrs[HWSIM_ATTR_FRAME] || !info->attrs[HWSIM_ATTR_RX_RATE] || !info->attrs[HWSIM_ATTR_SIGNAL]) goto out; dst = (void *)nla_data(info->attrs[HWSIM_ATTR_ADDR_RECEIVER]); frame_data_len = nla_len(info->attrs[HWSIM_ATTR_FRAME]); frame_data = (void *)nla_data(info->attrs[HWSIM_ATTR_FRAME]); if (frame_data_len < sizeof(struct ieee80211_hdr_3addr) || frame_data_len > IEEE80211_MAX_DATA_LEN) goto err; /* Allocate new skb here */ skb = alloc_skb(frame_data_len, GFP_KERNEL); if (skb == NULL) goto err; /* Copy the data */ skb_put_data(skb, frame_data, frame_data_len); data2 = get_hwsim_data_ref_from_addr(dst); if (!data2) goto out; if (data2->use_chanctx) { if (data2->tmp_chan) channel = data2->tmp_chan; } else { channel = data2->channel; } if (!hwsim_virtio_enabled) { if (hwsim_net_get_netgroup(genl_info_net(info)) != data2->netgroup) goto out; if (info->snd_portid != data2->wmediumd) goto out; } /* check if radio is configured properly */ if ((data2->idle && !data2->tmp_chan) || !data2->started) goto out; /* A frame is received from user space */ memset(&rx_status, 0, sizeof(rx_status)); if (info->attrs[HWSIM_ATTR_FREQ]) { struct tx_iter_data iter_data = {}; /* throw away off-channel packets, but allow both the temporary * ("hw" scan/remain-on-channel), regular channels and links, * since the internal datapath also allows this */ rx_status.freq = nla_get_u32(info->attrs[HWSIM_ATTR_FREQ]); iter_data.channel = ieee80211_get_channel(data2->hw->wiphy, rx_status.freq); if (!iter_data.channel) goto out; rx_status.band = iter_data.channel->band; mutex_lock(&data2->mutex); if (!hwsim_chans_compat(iter_data.channel, channel)) { ieee80211_iterate_active_interfaces_atomic( data2->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_tx_iter, &iter_data); if (!iter_data.receive) { mutex_unlock(&data2->mutex); goto out; } } mutex_unlock(&data2->mutex); } else if (!channel) { goto out; } else { rx_status.freq = channel->center_freq; rx_status.band = channel->band; } rx_status.rate_idx = nla_get_u32(info->attrs[HWSIM_ATTR_RX_RATE]); if (rx_status.rate_idx >= data2->hw->wiphy->bands[rx_status.band]->n_bitrates) goto out; rx_status.signal = nla_get_u32(info->attrs[HWSIM_ATTR_SIGNAL]); hdr = (void *)skb->data; if (ieee80211_is_beacon(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control)) rx_status.boottime_ns = ktime_get_boottime_ns(); mac80211_hwsim_rx(data2, &rx_status, skb); return 0; err: pr_debug("mac80211_hwsim: error occurred in %s\n", __func__); out: dev_kfree_skb(skb); return -EINVAL; } static int hwsim_register_received_nl(struct sk_buff *skb_2, struct genl_info *info) { struct net *net = genl_info_net(info); struct mac80211_hwsim_data *data; int chans = 1; spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) chans = max(chans, data->channels); spin_unlock_bh(&hwsim_radio_lock); /* In the future we should revise the userspace API and allow it * to set a flag that it does support multi-channel, then we can * let this pass conditionally on the flag. * For current userspace, prohibit it since it won't work right. */ if (chans > 1) return -EOPNOTSUPP; if (hwsim_net_get_wmediumd(net)) return -EBUSY; hwsim_register_wmediumd(net, info->snd_portid); pr_debug("mac80211_hwsim: received a REGISTER, " "switching to wmediumd mode with pid %d\n", info->snd_portid); return 0; } /* ensures ciphers only include ciphers listed in 'hwsim_ciphers' array */ static bool hwsim_known_ciphers(const u32 *ciphers, int n_ciphers) { int i; for (i = 0; i < n_ciphers; i++) { int j; int found = 0; for (j = 0; j < ARRAY_SIZE(hwsim_ciphers); j++) { if (ciphers[i] == hwsim_ciphers[j]) { found = 1; break; } } if (!found) return false; } return true; } static int parse_ftm_capa(const struct nlattr *ftm_capa, struct cfg80211_pmsr_capabilities *out, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX + 1]; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_FTM_CAPA_ATTR_MAX, ftm_capa, hwsim_ftm_capa_policy, NULL); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, ftm_capa, "malformed FTM capability"); return -EINVAL; } out->ftm.supported = 1; if (tb[NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES]) out->ftm.preambles = nla_get_u32(tb[NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES]); if (tb[NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS]) out->ftm.bandwidths = nla_get_u32(tb[NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS]); if (tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT]) out->ftm.max_bursts_exponent = nla_get_u8(tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT]); if (tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST]) out->ftm.max_ftms_per_burst = nla_get_u8(tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST]); out->ftm.asap = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_ASAP]; out->ftm.non_asap = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP]; out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI]; out->ftm.request_civicloc = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC]; out->ftm.trigger_based = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED]; out->ftm.non_trigger_based = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED]; return 0; } static int parse_pmsr_capa(const struct nlattr *pmsr_capa, struct cfg80211_pmsr_capabilities *out, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_ATTR_MAX + 1]; struct nlattr *nla; int size; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_ATTR_MAX, pmsr_capa, hwsim_pmsr_capa_policy, NULL); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, pmsr_capa, "malformed PMSR capability"); return -EINVAL; } if (tb[NL80211_PMSR_ATTR_MAX_PEERS]) out->max_peers = nla_get_u32(tb[NL80211_PMSR_ATTR_MAX_PEERS]); out->report_ap_tsf = !!tb[NL80211_PMSR_ATTR_REPORT_AP_TSF]; out->randomize_mac_addr = !!tb[NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR]; if (!tb[NL80211_PMSR_ATTR_TYPE_CAPA]) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_ATTR_TYPE_CAPA], "malformed PMSR type"); return -EINVAL; } nla_for_each_nested(nla, tb[NL80211_PMSR_ATTR_TYPE_CAPA], size) { switch (nla_type(nla)) { case NL80211_PMSR_TYPE_FTM: parse_ftm_capa(nla, out, info); break; default: NL_SET_ERR_MSG_ATTR(info->extack, nla, "unsupported measurement type"); return -EINVAL; } } return 0; } static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_new_radio_params param = { 0 }; const char *hwname = NULL; int ret; param.reg_strict = info->attrs[HWSIM_ATTR_REG_STRICT_REG]; param.p2p_device = info->attrs[HWSIM_ATTR_SUPPORT_P2P_DEVICE]; param.channels = channels; param.destroy_on_close = info->attrs[HWSIM_ATTR_DESTROY_RADIO_ON_CLOSE]; if (info->attrs[HWSIM_ATTR_CHANNELS]) param.channels = nla_get_u32(info->attrs[HWSIM_ATTR_CHANNELS]); if (param.channels < 1) { GENL_SET_ERR_MSG(info, "must have at least one channel"); return -EINVAL; } if (info->attrs[HWSIM_ATTR_NO_VIF]) param.no_vif = true; if (info->attrs[HWSIM_ATTR_USE_CHANCTX]) param.use_chanctx = true; else param.use_chanctx = (param.channels > 1); if (info->attrs[HWSIM_ATTR_REG_HINT_ALPHA2]) param.reg_alpha2 = nla_data(info->attrs[HWSIM_ATTR_REG_HINT_ALPHA2]); if (info->attrs[HWSIM_ATTR_REG_CUSTOM_REG]) { u32 idx = nla_get_u32(info->attrs[HWSIM_ATTR_REG_CUSTOM_REG]); if (idx >= ARRAY_SIZE(hwsim_world_regdom_custom)) return -EINVAL; idx = array_index_nospec(idx, ARRAY_SIZE(hwsim_world_regdom_custom)); param.regd = hwsim_world_regdom_custom[idx]; } if (info->attrs[HWSIM_ATTR_PERM_ADDR]) { if (!is_valid_ether_addr( nla_data(info->attrs[HWSIM_ATTR_PERM_ADDR]))) { GENL_SET_ERR_MSG(info,"MAC is no valid source addr"); NL_SET_BAD_ATTR(info->extack, info->attrs[HWSIM_ATTR_PERM_ADDR]); return -EINVAL; } param.perm_addr = nla_data(info->attrs[HWSIM_ATTR_PERM_ADDR]); } if (info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT]) { param.iftypes = nla_get_u32(info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT]); if (param.iftypes & ~HWSIM_IFTYPE_SUPPORT_MASK) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT], "cannot support more iftypes than kernel"); return -EINVAL; } } else { param.iftypes = HWSIM_IFTYPE_SUPPORT_MASK; } /* ensure both flag and iftype support is honored */ if (param.p2p_device || param.iftypes & BIT(NL80211_IFTYPE_P2P_DEVICE)) { param.iftypes |= BIT(NL80211_IFTYPE_P2P_DEVICE); param.p2p_device = true; } if (info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]) { u32 len = nla_len(info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]); param.ciphers = nla_data(info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]); if (len % sizeof(u32)) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_CIPHER_SUPPORT], "bad cipher list length"); return -EINVAL; } param.n_ciphers = len / sizeof(u32); if (param.n_ciphers > ARRAY_SIZE(hwsim_ciphers)) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_CIPHER_SUPPORT], "too many ciphers specified"); return -EINVAL; } if (!hwsim_known_ciphers(param.ciphers, param.n_ciphers)) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_CIPHER_SUPPORT], "unsupported ciphers specified"); return -EINVAL; } } param.mlo = info->attrs[HWSIM_ATTR_MLO_SUPPORT]; if (param.mlo) param.use_chanctx = true; if (info->attrs[HWSIM_ATTR_RADIO_NAME]) { hwname = kstrndup((char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]), nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]), GFP_KERNEL); if (!hwname) return -ENOMEM; param.hwname = hwname; } if (info->attrs[HWSIM_ATTR_PMSR_SUPPORT]) { struct cfg80211_pmsr_capabilities *pmsr_capa; pmsr_capa = kmalloc(sizeof(*pmsr_capa), GFP_KERNEL); if (!pmsr_capa) { ret = -ENOMEM; goto out_free; } param.pmsr_capa = pmsr_capa; ret = parse_pmsr_capa(info->attrs[HWSIM_ATTR_PMSR_SUPPORT], pmsr_capa, info); if (ret) goto out_free; } ret = mac80211_hwsim_new_radio(info, &param); out_free: kfree(hwname); kfree(param.pmsr_capa); return ret; } static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct mac80211_hwsim_data *data; s64 idx = -1; const char *hwname = NULL; if (info->attrs[HWSIM_ATTR_RADIO_ID]) { idx = nla_get_u32(info->attrs[HWSIM_ATTR_RADIO_ID]); } else if (info->attrs[HWSIM_ATTR_RADIO_NAME]) { hwname = kstrndup((char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]), nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]), GFP_KERNEL); if (!hwname) return -ENOMEM; } else return -EINVAL; spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) { if (idx >= 0) { if (data->idx != idx) continue; } else { if (!hwname || strcmp(hwname, wiphy_name(data->hw->wiphy))) continue; } if (!net_eq(wiphy_net(data->hw->wiphy), genl_info_net(info))) continue; list_del(&data->list); rhashtable_remove_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); hwsim_radios_generation++; spin_unlock_bh(&hwsim_radio_lock); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), info); kfree(hwname); return 0; } spin_unlock_bh(&hwsim_radio_lock); kfree(hwname); return -ENODEV; } static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct mac80211_hwsim_data *data; struct sk_buff *skb; int idx, res = -ENODEV; if (!info->attrs[HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[HWSIM_ATTR_RADIO_ID]); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) { if (data->idx != idx) continue; if (!net_eq(wiphy_net(data->hw->wiphy), genl_info_net(info))) continue; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) { res = -ENOMEM; goto out_err; } res = mac80211_hwsim_get_radio(skb, data, info->snd_portid, info->snd_seq, NULL, 0); if (res < 0) { nlmsg_free(skb); goto out_err; } res = genlmsg_reply(skb, info); break; } out_err: spin_unlock_bh(&hwsim_radio_lock); return res; } static int hwsim_dump_radio_nl(struct sk_buff *skb, struct netlink_callback *cb) { int last_idx = cb->args[0] - 1; struct mac80211_hwsim_data *data = NULL; int res = 0; void *hdr; spin_lock_bh(&hwsim_radio_lock); cb->seq = hwsim_radios_generation; if (last_idx >= hwsim_radio_idx-1) goto done; list_for_each_entry(data, &hwsim_radios, list) { if (data->idx <= last_idx) continue; if (!net_eq(wiphy_net(data->hw->wiphy), sock_net(skb->sk))) continue; res = mac80211_hwsim_get_radio(skb, data, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (res < 0) break; last_idx = data->idx; } cb->args[0] = last_idx + 1; /* list changed, but no new element sent, set interrupted flag */ if (skb->len == 0 && cb->prev_seq && cb->seq != cb->prev_seq) { hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &hwsim_genl_family, NLM_F_MULTI, HWSIM_CMD_GET_RADIO); if (hdr) { genl_dump_check_consistent(cb, hdr); genlmsg_end(skb, hdr); } else { res = -EMSGSIZE; } } done: spin_unlock_bh(&hwsim_radio_lock); return res ?: skb->len; } /* Generic Netlink operations array */ static const struct genl_small_ops hwsim_ops[] = { { .cmd = HWSIM_CMD_REGISTER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_register_received_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = HWSIM_CMD_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_cloned_frame_received_nl, }, { .cmd = HWSIM_CMD_TX_INFO_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_tx_info_frame_received_nl, }, { .cmd = HWSIM_CMD_NEW_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_new_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = HWSIM_CMD_DEL_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = HWSIM_CMD_GET_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_get_radio_nl, .dumpit = hwsim_dump_radio_nl, }, { .cmd = HWSIM_CMD_REPORT_PMSR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_pmsr_report_nl, }, }; static struct genl_family hwsim_genl_family __ro_after_init = { .name = "MAC80211_HWSIM", .version = 1, .maxattr = HWSIM_ATTR_MAX, .policy = hwsim_genl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = hwsim_ops, .n_small_ops = ARRAY_SIZE(hwsim_ops), .resv_start_op = HWSIM_CMD_REPORT_PMSR + 1, // match with __HWSIM_CMD_MAX .mcgrps = hwsim_mcgrps, .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), }; static void remove_user_radios(u32 portid) { struct mac80211_hwsim_data *entry, *tmp; LIST_HEAD(list); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry_safe(entry, tmp, &hwsim_radios, list) { if (entry->destroy_on_close && entry->portid == portid) { list_move(&entry->list, &list); rhashtable_remove_fast(&hwsim_radios_rht, &entry->rht, hwsim_rht_params); hwsim_radios_generation++; } } spin_unlock_bh(&hwsim_radio_lock); list_for_each_entry_safe(entry, tmp, &list, list) { list_del(&entry->list); mac80211_hwsim_del_radio(entry, wiphy_name(entry->hw->wiphy), NULL); } } static int mac80211_hwsim_netlink_notify(struct notifier_block *nb, unsigned long state, void *_notify) { struct netlink_notify *notify = _notify; if (state != NETLINK_URELEASE) return NOTIFY_DONE; remove_user_radios(notify->portid); if (notify->portid == hwsim_net_get_wmediumd(notify->net)) { printk(KERN_INFO "mac80211_hwsim: wmediumd released netlink" " socket, switching to perfect channel medium\n"); hwsim_register_wmediumd(notify->net, 0); } return NOTIFY_DONE; } static struct notifier_block hwsim_netlink_notifier = { .notifier_call = mac80211_hwsim_netlink_notify, }; static int __init hwsim_init_netlink(void) { int rc; printk(KERN_INFO "mac80211_hwsim: initializing netlink\n"); rc = genl_register_family(&hwsim_genl_family); if (rc) goto failure; rc = netlink_register_notifier(&hwsim_netlink_notifier); if (rc) { genl_unregister_family(&hwsim_genl_family); goto failure; } return 0; failure: pr_debug("mac80211_hwsim: error occurred in %s\n", __func__); return -EINVAL; } static __net_init int hwsim_init_net(struct net *net) { return hwsim_net_set_netgroup(net); } static void __net_exit hwsim_exit_net(struct net *net) { struct mac80211_hwsim_data *data, *tmp; LIST_HEAD(list); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry_safe(data, tmp, &hwsim_radios, list) { if (!net_eq(wiphy_net(data->hw->wiphy), net)) continue; /* Radios created in init_net are returned to init_net. */ if (data->netgroup == hwsim_net_get_netgroup(&init_net)) continue; list_move(&data->list, &list); rhashtable_remove_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); hwsim_radios_generation++; } spin_unlock_bh(&hwsim_radio_lock); list_for_each_entry_safe(data, tmp, &list, list) { list_del(&data->list); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), NULL); } ida_free(&hwsim_netgroup_ida, hwsim_net_get_netgroup(net)); } static struct pernet_operations hwsim_net_ops = { .init = hwsim_init_net, .exit = hwsim_exit_net, .id = &hwsim_net_id, .size = sizeof(struct hwsim_net), }; static void hwsim_exit_netlink(void) { /* unregister the notifier */ netlink_unregister_notifier(&hwsim_netlink_notifier); /* unregister the family */ genl_unregister_family(&hwsim_genl_family); } #if IS_REACHABLE(CONFIG_VIRTIO) static void hwsim_virtio_tx_done(struct virtqueue *vq) { unsigned int len; struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&hwsim_virtio_lock, flags); while ((skb = virtqueue_get_buf(vq, &len))) dev_kfree_skb_irq(skb); spin_unlock_irqrestore(&hwsim_virtio_lock, flags); } static int hwsim_virtio_handle_cmd(struct sk_buff *skb) { struct nlmsghdr *nlh; struct genlmsghdr *gnlh; struct nlattr *tb[HWSIM_ATTR_MAX + 1]; struct genl_info info = {}; int err; nlh = nlmsg_hdr(skb); gnlh = nlmsg_data(nlh); if (skb->len < nlh->nlmsg_len) return -EINVAL; err = genlmsg_parse(nlh, &hwsim_genl_family, tb, HWSIM_ATTR_MAX, hwsim_genl_policy, NULL); if (err) { pr_err_ratelimited("hwsim: genlmsg_parse returned %d\n", err); return err; } info.attrs = tb; switch (gnlh->cmd) { case HWSIM_CMD_FRAME: hwsim_cloned_frame_received_nl(skb, &info); break; case HWSIM_CMD_TX_INFO_FRAME: hwsim_tx_info_frame_received_nl(skb, &info); break; case HWSIM_CMD_REPORT_PMSR: hwsim_pmsr_report_nl(skb, &info); break; default: pr_err_ratelimited("hwsim: invalid cmd: %d\n", gnlh->cmd); return -EPROTO; } return 0; } static void hwsim_virtio_rx_work(struct work_struct *work) { struct virtqueue *vq; unsigned int len; struct sk_buff *skb; struct scatterlist sg[1]; int err; unsigned long flags; spin_lock_irqsave(&hwsim_virtio_lock, flags); if (!hwsim_virtio_enabled) goto out_unlock; skb = virtqueue_get_buf(hwsim_vqs[HWSIM_VQ_RX], &len); if (!skb) goto out_unlock; spin_unlock_irqrestore(&hwsim_virtio_lock, flags); skb->data = skb->head; skb_reset_tail_pointer(skb); skb_put(skb, len); hwsim_virtio_handle_cmd(skb); spin_lock_irqsave(&hwsim_virtio_lock, flags); if (!hwsim_virtio_enabled) { dev_kfree_skb_irq(skb); goto out_unlock; } vq = hwsim_vqs[HWSIM_VQ_RX]; sg_init_one(sg, skb->head, skb_end_offset(skb)); err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_ATOMIC); if (WARN(err, "virtqueue_add_inbuf returned %d\n", err)) dev_kfree_skb_irq(skb); else virtqueue_kick(vq); schedule_work(&hwsim_virtio_rx); out_unlock: spin_unlock_irqrestore(&hwsim_virtio_lock, flags); } static void hwsim_virtio_rx_done(struct virtqueue *vq) { schedule_work(&hwsim_virtio_rx); } static int init_vqs(struct virtio_device *vdev) { vq_callback_t *callbacks[HWSIM_NUM_VQS] = { [HWSIM_VQ_TX] = hwsim_virtio_tx_done, [HWSIM_VQ_RX] = hwsim_virtio_rx_done, }; const char *names[HWSIM_NUM_VQS] = { [HWSIM_VQ_TX] = "tx", [HWSIM_VQ_RX] = "rx", }; return virtio_find_vqs(vdev, HWSIM_NUM_VQS, hwsim_vqs, callbacks, names, NULL); } static int fill_vq(struct virtqueue *vq) { int i, err; struct sk_buff *skb; struct scatterlist sg[1]; for (i = 0; i < virtqueue_get_vring_size(vq); i++) { skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; sg_init_one(sg, skb->head, skb_end_offset(skb)); err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_KERNEL); if (err) { nlmsg_free(skb); return err; } } virtqueue_kick(vq); return 0; } static void remove_vqs(struct virtio_device *vdev) { int i; virtio_reset_device(vdev); for (i = 0; i < ARRAY_SIZE(hwsim_vqs); i++) { struct virtqueue *vq = hwsim_vqs[i]; struct sk_buff *skb; while ((skb = virtqueue_detach_unused_buf(vq))) nlmsg_free(skb); } vdev->config->del_vqs(vdev); } static int hwsim_virtio_probe(struct virtio_device *vdev) { int err; unsigned long flags; spin_lock_irqsave(&hwsim_virtio_lock, flags); if (hwsim_virtio_enabled) { spin_unlock_irqrestore(&hwsim_virtio_lock, flags); return -EEXIST; } spin_unlock_irqrestore(&hwsim_virtio_lock, flags); err = init_vqs(vdev); if (err) return err; virtio_device_ready(vdev); err = fill_vq(hwsim_vqs[HWSIM_VQ_RX]); if (err) goto out_remove; spin_lock_irqsave(&hwsim_virtio_lock, flags); hwsim_virtio_enabled = true; spin_unlock_irqrestore(&hwsim_virtio_lock, flags); schedule_work(&hwsim_virtio_rx); return 0; out_remove: remove_vqs(vdev); return err; } static void hwsim_virtio_remove(struct virtio_device *vdev) { hwsim_virtio_enabled = false; cancel_work_sync(&hwsim_virtio_rx); remove_vqs(vdev); } /* MAC80211_HWSIM virtio device id table */ static const struct virtio_device_id id_table[] = { { VIRTIO_ID_MAC80211_HWSIM, VIRTIO_DEV_ANY_ID }, { 0 } }; MODULE_DEVICE_TABLE(virtio, id_table); static struct virtio_driver virtio_hwsim = { .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, .probe = hwsim_virtio_probe, .remove = hwsim_virtio_remove, }; static int hwsim_register_virtio_driver(void) { return register_virtio_driver(&virtio_hwsim); } static void hwsim_unregister_virtio_driver(void) { unregister_virtio_driver(&virtio_hwsim); } #else static inline int hwsim_register_virtio_driver(void) { return 0; } static inline void hwsim_unregister_virtio_driver(void) { } #endif static int __init init_mac80211_hwsim(void) { int i, err; if (radios < 0 || radios > 100) return -EINVAL; if (channels < 1) return -EINVAL; err = rhashtable_init(&hwsim_radios_rht, &hwsim_rht_params); if (err) return err; err = register_pernet_device(&hwsim_net_ops); if (err) goto out_free_rht; err = platform_driver_register(&mac80211_hwsim_driver); if (err) goto out_unregister_pernet; err = hwsim_init_netlink(); if (err) goto out_unregister_driver; err = hwsim_register_virtio_driver(); if (err) goto out_exit_netlink; hwsim_class = class_create("mac80211_hwsim"); if (IS_ERR(hwsim_class)) { err = PTR_ERR(hwsim_class); goto out_exit_virtio; } hwsim_init_s1g_channels(hwsim_channels_s1g); for (i = 0; i < radios; i++) { struct hwsim_new_radio_params param = { 0 }; param.channels = channels; switch (regtest) { case HWSIM_REGTEST_DIFF_COUNTRY: if (i < ARRAY_SIZE(hwsim_alpha2s)) param.reg_alpha2 = hwsim_alpha2s[i]; break; case HWSIM_REGTEST_DRIVER_REG_FOLLOW: if (!i) param.reg_alpha2 = hwsim_alpha2s[0]; break; case HWSIM_REGTEST_STRICT_ALL: param.reg_strict = true; fallthrough; case HWSIM_REGTEST_DRIVER_REG_ALL: param.reg_alpha2 = hwsim_alpha2s[0]; break; case HWSIM_REGTEST_WORLD_ROAM: if (i == 0) param.regd = &hwsim_world_regdom_custom_01; break; case HWSIM_REGTEST_CUSTOM_WORLD: param.regd = &hwsim_world_regdom_custom_01; break; case HWSIM_REGTEST_CUSTOM_WORLD_2: if (i == 0) param.regd = &hwsim_world_regdom_custom_01; else if (i == 1) param.regd = &hwsim_world_regdom_custom_02; break; case HWSIM_REGTEST_STRICT_FOLLOW: if (i == 0) { param.reg_strict = true; param.reg_alpha2 = hwsim_alpha2s[0]; } break; case HWSIM_REGTEST_STRICT_AND_DRIVER_REG: if (i == 0) { param.reg_strict = true; param.reg_alpha2 = hwsim_alpha2s[0]; } else if (i == 1) { param.reg_alpha2 = hwsim_alpha2s[1]; } break; case HWSIM_REGTEST_ALL: switch (i) { case 0: param.regd = &hwsim_world_regdom_custom_01; break; case 1: param.regd = &hwsim_world_regdom_custom_02; break; case 2: param.reg_alpha2 = hwsim_alpha2s[0]; break; case 3: param.reg_alpha2 = hwsim_alpha2s[1]; break; case 4: param.reg_strict = true; param.reg_alpha2 = hwsim_alpha2s[2]; break; } break; default: break; } param.p2p_device = support_p2p_device; param.mlo = mlo; param.use_chanctx = channels > 1 || mlo; param.iftypes = HWSIM_IFTYPE_SUPPORT_MASK; if (param.p2p_device) param.iftypes |= BIT(NL80211_IFTYPE_P2P_DEVICE); err = mac80211_hwsim_new_radio(NULL, &param); if (err < 0) goto out_free_radios; } hwsim_mon = alloc_netdev(0, "hwsim%d", NET_NAME_UNKNOWN, hwsim_mon_setup); if (hwsim_mon == NULL) { err = -ENOMEM; goto out_free_radios; } rtnl_lock(); err = dev_alloc_name(hwsim_mon, hwsim_mon->name); if (err < 0) { rtnl_unlock(); goto out_free_mon; } err = register_netdevice(hwsim_mon); if (err < 0) { rtnl_unlock(); goto out_free_mon; } rtnl_unlock(); return 0; out_free_mon: free_netdev(hwsim_mon); out_free_radios: mac80211_hwsim_free(); out_exit_virtio: hwsim_unregister_virtio_driver(); out_exit_netlink: hwsim_exit_netlink(); out_unregister_driver: platform_driver_unregister(&mac80211_hwsim_driver); out_unregister_pernet: unregister_pernet_device(&hwsim_net_ops); out_free_rht: rhashtable_destroy(&hwsim_radios_rht); return err; } module_init(init_mac80211_hwsim); static void __exit exit_mac80211_hwsim(void) { pr_debug("mac80211_hwsim: unregister radios\n"); hwsim_unregister_virtio_driver(); hwsim_exit_netlink(); mac80211_hwsim_free(); rhashtable_destroy(&hwsim_radios_rht); unregister_netdev(hwsim_mon); platform_driver_unregister(&mac80211_hwsim_driver); unregister_pernet_device(&hwsim_net_ops); } module_exit(exit_mac80211_hwsim);
34 34 3 2 2 1 3 2 2 1 4 3 3 1 4 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2022, Intel Corporation. */ #include "protocol.h" #include "mib.h" void mptcp_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); if (!mptcp_pm_is_userspace(msk)) return; spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { sock_kfree_s(sk, entry, sizeof(*entry)); } } static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_pm_addr_entry *match = NULL; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *e; bool addr_match = false; bool id_match = false; int ret = -EINVAL; bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_lock_bh(&msk->pm.lock); list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); if (addr_match && entry->addr.id == 0) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); if (addr_match && id_match) { match = e; break; } else if (addr_match || id_match) { break; } __set_bit(e->addr.id, id_bitmap); } if (!match && !addr_match && !id_match) { /* Memory for the entry is allocated from the * sock option buffer. */ e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); if (!e) { ret = -ENOMEM; goto append_err; } *e = *entry; if (!e->addr.id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 1); list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); msk->pm.local_addr_used++; ret = e->addr.id; } else if (match) { ret = entry->addr.id; } append_err: spin_unlock_bh(&msk->pm.lock); return ret; } /* If the subflow is closed from the other peer (not via a * subflow destroy command then), we want to keep the entry * not to assign the same ID to another address and to be * able to send RM_ADDR after the removal of the subflow. */ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { struct mptcp_pm_addr_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { /* TODO: a refcount is needed because the entry can * be used multiple times (e.g. fullmesh mode). */ list_del_rcu(&entry->list); kfree(entry); msk->pm.local_addr_used--; return 0; } } return -EINVAL; } int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *entry, *match = NULL; spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (id == entry->addr.id) { match = entry; break; } } spin_unlock_bh(&msk->pm.lock); if (match) { *flags = match->flags; *ifindex = match->ifindex; } return 0; } int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); new_entry.addr = *skc; new_entry.addr.id = 0; new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; if (new_entry.addr.port == msk_sport) new_entry.addr.port = 0; return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry); } int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; u32 token_val; if (!addr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto announce_err; } err = mptcp_pm_parse_entry(addr, info, true, &addr_val); if (err < 0) { GENL_SET_ERR_MSG(info, "error parsing local address"); goto announce_err; } if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { GENL_SET_ERR_MSG(info, "invalid addr id or flags"); err = -EINVAL; goto announce_err; } err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto announce_err; } lock_sock(sk); spin_lock_bh(&msk->pm.lock); if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); mptcp_pm_nl_addr_send_ack(msk); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); err = 0; announce_err: sock_put(sk); return err; } static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, struct genl_info *info) { struct mptcp_rm_list list = { .nr = 0 }; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; bool has_id_0 = false; int err = -EINVAL; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { if (subflow->local_id == 0) { has_id_0 = true; break; } } if (!has_id_0) { GENL_SET_ERR_MSG(info, "address with id 0 not found"); goto remove_err; } list.ids[list.nr++] = 0; spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); err = 0; remove_err: release_sock(sk); return err; } int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match = NULL; struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; LIST_HEAD(free_list); int err = -EINVAL; struct sock *sk; u32 token_val; u8 id_val; if (!id || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } id_val = nla_get_u8(id); token_val = nla_get_u32(token); msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto out; } if (id_val == 0) { err = mptcp_userspace_pm_remove_id_zero_address(msk, info); goto out; } lock_sock(sk); list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (entry->addr.id == id_val) { match = entry; break; } } if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); release_sock(sk); goto out; } list_move(&match->list, &free_list); mptcp_pm_remove_addrs(msk, &free_list); release_sock(sk); list_for_each_entry_safe(match, entry, &free_list, list) { sock_kfree_s(sk, match, sizeof(*match)); } err = 0; out: sock_put(sk); return err; } int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry local = { 0 }; struct mptcp_addr_info addr_r; struct mptcp_addr_info addr_l; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; u32 token_val; if (!laddr || !raddr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(genl_info_net(info), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto create_err; } err = mptcp_pm_parse_addr(laddr, info, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto create_err; } err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); goto create_err; } if (!mptcp_pm_addr_families_match(sk, &addr_l, &addr_r)) { GENL_SET_ERR_MSG(info, "families mismatch"); err = -EINVAL; goto create_err; } local.addr = addr_l; err = mptcp_userspace_pm_append_new_local_addr(msk, &local); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto create_err; } lock_sock(sk); err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); release_sock(sk); spin_lock_bh(&msk->pm.lock); if (err) mptcp_userspace_pm_delete_local_addr(msk, &local); else msk->pm.subflows++; spin_unlock_bh(&msk->pm.lock); create_err: sock_put(sk); return err; } static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, const struct mptcp_addr_info *local, const struct mptcp_addr_info *remote) { struct mptcp_subflow_context *subflow; if (local->family != remote->family) return NULL; mptcp_for_each_subflow(msk, subflow) { const struct inet_sock *issk; struct sock *ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (local->family != ssk->sk_family) continue; issk = inet_sk(ssk); switch (ssk->sk_family) { case AF_INET: if (issk->inet_saddr != local->addr.s_addr || issk->inet_daddr != remote->addr.s_addr) continue; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *pinfo = inet6_sk(ssk); if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) continue; break; } #endif default: continue; } if (issk->inet_sport == local->port && issk->inet_dport == remote->port) return ssk; } return NULL; } int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_addr_info addr_l; struct mptcp_addr_info addr_r; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; u32 token_val; if (!laddr || !raddr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(genl_info_net(info), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto destroy_err; } err = mptcp_pm_parse_addr(laddr, info, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto destroy_err; } err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); goto destroy_err; } if (addr_l.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); err = -EINVAL; goto destroy_err; } if (!addr_l.port || !addr_r.port) { GENL_SET_ERR_MSG(info, "missing local or remote port"); err = -EINVAL; goto destroy_err; } lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); if (ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_pm_addr_entry entry = { .addr = addr_l }; spin_lock_bh(&msk->pm.lock); mptcp_userspace_pm_delete_local_addr(msk, &entry); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); mptcp_close_ssk(sk, ssk, subflow); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); err = 0; } else { err = -ESRCH; } release_sock(sk); destroy_err: sock_put(sk); return err; } int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, struct mptcp_pm_addr_entry *loc, struct mptcp_pm_addr_entry *rem, u8 bkup) { struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; u32 token_val; token_val = nla_get_u32(token); msk = mptcp_token_get_sock(net, token_val); if (!msk) return ret; sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) goto set_flags_err; if (loc->addr.family == AF_UNSPEC || rem->addr.family == AF_UNSPEC) goto set_flags_err; lock_sock(sk); ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup); release_sock(sk); set_flags_err: sock_put(sk); return ret; }
1982 1250 214 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/mballoc.h * * Written by: Alex Tomas <alex@clusterfs.com> * */ #ifndef _EXT4_MBALLOC_H #define _EXT4_MBALLOC_H #include <linux/time.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/proc_fs.h> #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4_jbd2.h" #include "ext4.h" /* * mb_debug() dynamic printk msgs could be used to debug mballoc code. */ #ifdef CONFIG_EXT4_DEBUG #define mb_debug(sb, fmt, ...) \ pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ current->comm, task_pid_nr(current), sb->s_id, \ __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else #define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ /* * How long mballoc can look for a best extent (in found extents) */ #define MB_DEFAULT_MAX_TO_SCAN 200 /* * How long mballoc must look for a best extent */ #define MB_DEFAULT_MIN_TO_SCAN 10 /* * with 's_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ #define MB_DEFAULT_STATS 0 /* * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served * by the stream allocator, which purpose is to pack requests * as close each to other as possible to produce smooth I/O traffic * We use locality group prealloc space for stream request. * We can tune the same via /proc/fs/ext4/<partition>/stream_req */ #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ /* * for which requests use 2^N search using buddies */ #define MB_DEFAULT_ORDER2_REQS 2 /* * default group prealloc size 512 blocks */ #define MB_DEFAULT_GROUP_PREALLOC 512 /* * Number of groups to search linearly before performing group scanning * optimization. */ #define MB_DEFAULT_LINEAR_LIMIT 4 /* * Minimum number of groups that should be present in the file system to perform * group scanning optimizations. */ #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 /* * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular * allocation request. Example, if we have an order 7 request and max trim order * of 3, we can trim this request upto order 4. */ #define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3 /* * Number of valid buddy orders */ #define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) struct ext4_free_data { /* this links the free block information from sb_info */ struct list_head efd_list; /* this links the free block information from group_info */ struct rb_node efd_node; /* group which free block extent belongs */ ext4_group_t efd_group; /* free block extent */ ext4_grpblk_t efd_start_cluster; ext4_grpblk_t efd_count; /* transaction which freed this extent */ tid_t efd_tid; }; struct ext4_prealloc_space { union { struct rb_node inode_node; /* for inode PA rbtree */ struct list_head lg_list; /* for lg PAs */ } pa_node; struct list_head pa_group_list; union { struct list_head pa_tmp_list; struct rcu_head pa_rcu; } u; spinlock_t pa_lock; atomic_t pa_count; unsigned pa_deleted; ext4_fsblk_t pa_pstart; /* phys. block */ ext4_lblk_t pa_lstart; /* log. block */ ext4_grpblk_t pa_len; /* len of preallocated chunk */ ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ union { rwlock_t *inode_lock; /* locks the rbtree holding this PA */ spinlock_t *lg_lock; /* locks the lg list holding this PA */ } pa_node_lock; struct inode *pa_inode; /* used to get the inode during group discard */ }; enum { MB_INODE_PA = 0, MB_GROUP_PA = 1 }; struct ext4_free_extent { ext4_lblk_t fe_logical; ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; ext4_grpblk_t fe_len; /* In cluster units */ }; /* * Locality group: * we try to group all related changes together * so that writeback can flush/allocate them together as well * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC * (512). We store prealloc space into the hash based on the pa_free blocks * order value.ie, fls(pa_free)-1; */ #define PREALLOC_TB_SIZE 10 struct ext4_locality_group { /* for allocator */ /* to serialize allocates */ struct mutex lg_mutex; /* list of preallocations */ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; spinlock_t lg_prealloc_lock; }; struct ext4_allocation_context { struct inode *ac_inode; struct super_block *ac_sb; /* original request */ struct ext4_free_extent ac_o_ex; /* goal request (normalized ac_o_ex) */ struct ext4_free_extent ac_g_ex; /* the best found extent */ struct ext4_free_extent ac_b_ex; /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; /* * goal len can change in CR1.5, so save the original len. This is * used while adjusting the PA window and for accounting. */ ext4_grpblk_t ac_orig_goal_len; __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; __u16 ac_buddy; __u8 ac_status; __u8 ac_criteria; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; #define AC_STATUS_CONTINUE 1 #define AC_STATUS_FOUND 2 #define AC_STATUS_BREAK 3 struct ext4_buddy { struct page *bd_buddy_page; void *bd_buddy; struct page *bd_bitmap_page; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; }; static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { return ext4_group_first_block_no(sb, fex->fe_group) + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, struct ext4_free_extent *fex) { /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); } static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, struct ext4_prealloc_space *pa) { /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); } typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t len, void *priv); int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t end, ext4_mballoc_query_range_fn formatter, void *priv); #endif
19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Definitions for key type implementations * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_KEY_TYPE_H #define _LINUX_KEY_TYPE_H #include <linux/key.h> #include <linux/errno.h> #ifdef CONFIG_KEYS struct kernel_pkey_query; struct kernel_pkey_params; /* * Pre-parsed payload, used by key add, update and instantiate. * * This struct will be cleared and data and datalen will be set with the data * and length parameters from the caller and quotalen will be set from * def_datalen from the key type. Then if the preparse() op is provided by the * key type, that will be called. Then the struct will be passed to the * instantiate() or the update() op. * * If the preparse() op is given, the free_preparse() op will be called to * clear the contents. */ struct key_preparsed_payload { const char *orig_description; /* Actual or proposed description (maybe NULL) */ char *description; /* Proposed key description (or NULL) */ union key_payload payload; /* Proposed payload */ const void *data; /* Raw data */ size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time64_t expiry; /* Expiry time of key */ } __randomize_layout; typedef int (*request_key_actor_t)(struct key *auth_key, void *aux); /* * Preparsed matching criterion. */ struct key_match_data { /* Comparison function, defaults to exact description match, but can be * overridden by type->match_preparse(). Should return true if a match * is found and false if not. */ bool (*cmp)(const struct key *key, const struct key_match_data *match_data); const void *raw_data; /* Raw match data */ void *preparsed; /* For ->match_preparse() to stash stuff */ unsigned lookup_type; /* Type of lookup for this search. */ #define KEYRING_SEARCH_LOOKUP_DIRECT 0x0000 /* Direct lookup by description. */ #define KEYRING_SEARCH_LOOKUP_ITERATE 0x0001 /* Iterative search. */ }; /* * kernel managed key type definition */ struct key_type { /* name of the type */ const char *name; /* default payload length for quota precalculation (optional) * - this can be used instead of calling key_payload_reserve(), that * function only needs to be called if the real datalen is different */ size_t def_datalen; unsigned int flags; #define KEY_TYPE_NET_DOMAIN 0x00000001 /* Keys of this type have a net namespace domain */ #define KEY_TYPE_INSTANT_REAP 0x00000002 /* Keys of this type don't have a delay after expiring */ /* vet a description */ int (*vet_description)(const char *description); /* Preparse the data blob from userspace that is to be the payload, * generating a proposed description and payload that will be handed to * the instantiate() and update() ops. */ int (*preparse)(struct key_preparsed_payload *prep); /* Free a preparse data structure. */ void (*free_preparse)(struct key_preparsed_payload *prep); /* instantiate a key of this type * - this method should call key_payload_reserve() to determine if the * user's quota will hold the payload */ int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); /* update a key of this type (optional) * - this method should call key_payload_reserve() to recalculate the * quota consumption * - the key must be locked against read when modifying */ int (*update)(struct key *key, struct key_preparsed_payload *prep); /* Preparse the data supplied to ->match() (optional). The * data to be preparsed can be found in match_data->raw_data. * The lookup type can also be set by this function. */ int (*match_preparse)(struct key_match_data *match_data); /* Free preparsed match data (optional). This should be supplied it * ->match_preparse() is supplied. */ void (*match_free)(struct key_match_data *match_data); /* clear some of the data from a key on revokation (optional) * - the key's semaphore will be write-locked by the caller */ void (*revoke)(struct key *key); /* clear the data from a key (optional) */ void (*destroy)(struct key *key); /* describe a key */ void (*describe)(const struct key *key, struct seq_file *p); /* read a key's data (optional) * - permission checks will be done by the caller * - the key's semaphore will be readlocked by the caller * - should return the amount of data that could be read, no matter how * much is copied into the buffer * - shouldn't do the copy if the buffer is NULL */ long (*read)(const struct key *key, char *buffer, size_t buflen); /* handle request_key() for this type instead of invoking * /sbin/request-key (optional) * - key is the key to instantiate * - authkey is the authority to assume when instantiating this key * - op is the operation to be done, usually "create" * - the call must not return until the instantiation process has run * its course */ request_key_actor_t request_key; /* Look up a keyring access restriction (optional) * * - NULL is a valid return value (meaning the requested restriction * is known but will never block addition of a key) * - should return -EINVAL if the restriction is unknown */ struct key_restriction *(*lookup_restriction)(const char *params); /* Asymmetric key accessor functions. */ int (*asym_query)(const struct kernel_pkey_params *params, struct kernel_pkey_query *info); int (*asym_eds_op)(struct kernel_pkey_params *params, const void *in, void *out); int (*asym_verify_signature)(struct kernel_pkey_params *params, const void *in, const void *in2); /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ } __randomize_layout; extern struct key_type key_type_keyring; extern int register_key_type(struct key_type *ktype); extern void unregister_key_type(struct key_type *ktype); extern int key_payload_reserve(struct key *key, size_t datalen); extern int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey); extern int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey); extern void complete_request_key(struct key *authkey, int error); static inline int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, struct key *authkey) { return key_reject_and_link(key, timeout, ENOKEY, keyring, authkey); } extern int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep); #endif /* CONFIG_KEYS */ #endif /* _LINUX_KEY_TYPE_H */
2 2 791 8 11789 3 784 27 18 20 7 72 27 7 27 88 31 88 58 21 39 25 27 54 54 93 50 2 8 1 1 15 14 14 1 54 45 17 14 45 52 45 44 8 8 41 41 41 5 45 54 10 6 8 7 1 7 6 10 3 2 85 85 83 33 84 31 9 83 25 85 84 86 84 30 11 30 56 54 56 28 56 31 30 13 7 13 3 1 7 7 19 7 14 11 2 12 2 2 5 12 1 12 12 47 47 47 26 23 1 1 15 29 28 27 28 2 15 12 22 22 3 21 63 63 63 29 62 63 63 62 28 62 29 62 60 54 54 22 22 4 4 4 4 3 21 22 54 8 59 65 63 65 114 108 114 66 64 14 2 13 13 6 6 3 2 2 22 81 80 3 2 70 67 64 64 6 8 7 5 5 8 5 1 5 4 4 70 16 14 16 13 2 16 36 35 34 34 33 1 29 29 29 29 29 19 36 7 22 10 12 16 15 22 99 99 52 4837 102 4838 4826 4831 7 7 4825 3 2 2 3 3 6 2 7 7 11 3 6 9873 8 8 8 11799 11772 8 1 6 6 6 1 6 12 12 11796 45 11071 11072 177 45 1 43 11055 11068 12 4720 8864 1714 2 2 1 4401 2 4397 2 4399 4398 4397 1696 1697 1697 37 35 37 37 37 33 33 14 15 15 10 9 15 12 4 12 3 13 14 10 4 13 4 4 3 3 3 13 765 11 8 11 764 4 3 3 3 3 3 3 12 13 13 11 11 3 3 2 3 3 3 2 3 11 11 11 11 3 3 3 760 2 2 2 760 11 11 11 455 1 1 1 36 20 36 22 17 27 11 7 3 7 6 3 4 2 3 16 9 7 14 11 7 4 2 12 20 13 19 12 36 3 3 1 1 3 1 1 1 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 // SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * * Support four policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. * * preferred many Try a set of nodes first before normal fallback. This is * similar to preferred without the special case. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search * @state: State to filter the search * * Lookup the closest node by distance if @nid is not in state. * * Return: this @node if it is in state, otherwise the closest node by distance */ int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; if (state >= NR_NODE_STATES) return -EINVAL; if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_nearest_node); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. */ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. */ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(is_pmd_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pfn_folio(pmd_pfn(*pmd)); if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { if (is_migration_entry(pte_to_swp_entry(ptent))) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed++; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; goto unlock; } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) endvma = end; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy. */ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; struct page *page; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; page = alloc_pages_mpol(gfp, order, pol, ilx, nid); return page_rmappable_folio(page); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ if (new->mode == MPOL_INTERLEAVE) { struct page *page; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(page, &pagelist, lru) { if (!PageKsm(page)) break; } if (!list_entry_is_head(page, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma(page, vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = compound_order(page); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= page->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode != MPOL_BIND) return -EINVAL; *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; nid = next_node_in(current->il_prev, policy->nodes); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return z->zone ? zone_to_nid(z->zone) : node; } case MPOL_LOCAL: return node; default: BUG(); } } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask = pol->nodes; unsigned int target, nnodes; int i; int nid; /* * The barrier will stabilize the nodemask in a register or on * the stack so that it will stop changing under the code. * * Between first_node() and next_node(), pol->nodes could be changed * by other threads. So we put pol->nodes in a local stack. */ barrier(); nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). */ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_pages_node(nid, gfp | __GFP_THISNODE | __GFP_NORETRY, order); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_pages(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * @hugepage: Unused (was: For hugepages try only preferred node if possible). * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of alloc_pages_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; pgoff_t ilx; struct page *page; pol = get_vma_policy(vma, addr, order, &ilx); page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio); /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = __alloc_pages_bulk(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { nr_allocated = __alloc_pages_bulk(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; } /* alloc pages bulk and mempolicy should be considered at the * same time in some situation such as vmalloc. * * It can accelerate memory allocation especially interleaving * allocate memory. */ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return __alloc_pages_bulk(gfp, nid, nodemask, nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vma: vm area where folio mapped * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: /* Optimize placement among multiple nodes via NUMA balancing */ if (pol->flags & MPOL_F_MORON) { if (node_isset(thisnid, pol->nodes)) break; goto out; } fallthrough; case MPOL_PREFERRED_MANY: /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the * longest flag, "relative", and to display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); }
23 8 4 8 22 23 23 15 1 23 23 23 10 10 10 23 23 23 23 18 23 12 20 23 23 23 22 21 23 23 23 23 78 61 10 55 75 78 71 50 4 69 71 49 18 18 18 49 49 49 33 48 49 49 48 49 49 49 48 23 23 23 23 23 2 2 2 2 2 2 23 12 12 2 2 12 23 65 1 65 11 64 2 1 2 2 2 63 63 63 6 1 5 59 63 30 4 1 29 30 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-mul.c - MPI helper functions * Copyright (C) 1994, 1996, 1998, 1999, * 2000 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include <linux/string.h> #include "mpi-internal.h" #include "longlong.h" #define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \ do { \ if ((size) < KARATSUBA_THRESHOLD) \ mul_n_basecase(prodp, up, vp, size); \ else \ mul_n(prodp, up, vp, size, tspace); \ } while (0); #define MPN_SQR_N_RECURSE(prodp, up, size, tspace) \ do { \ if ((size) < KARATSUBA_THRESHOLD) \ mpih_sqr_n_basecase(prodp, up, size); \ else \ mpih_sqr_n(prodp, up, size, tspace); \ } while (0); /* Multiply the natural numbers u (pointed to by UP) and v (pointed to by VP), * both with SIZE limbs, and store the result at PRODP. 2 * SIZE limbs are * always stored. Return the most significant limb. * * Argument constraints: * 1. PRODP != UP and PRODP != VP, i.e. the destination * must be distinct from the multiplier and the multiplicand. * * * Handle simple cases with traditional multiplication. * * This is the most critical code of multiplication. All multiplies rely * on this, both small and huge. Small ones arrive here immediately. Huge * ones arrive here as this is the base case for Karatsuba's recursive * algorithm below. */ static mpi_limb_t mul_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size) { mpi_size_t i; mpi_limb_t cy; mpi_limb_t v_limb; /* Multiply by the first limb in V separately, as the result can be * stored (not added) to PROD. We also avoid a loop for zeroing. */ v_limb = vp[0]; if (v_limb <= 1) { if (v_limb == 1) MPN_COPY(prodp, up, size); else MPN_ZERO(prodp, size); cy = 0; } else cy = mpihelp_mul_1(prodp, up, size, v_limb); prodp[size] = cy; prodp++; /* For each iteration in the outer loop, multiply one limb from * U with one limb from V, and add it to PROD. */ for (i = 1; i < size; i++) { v_limb = vp[i]; if (v_limb <= 1) { cy = 0; if (v_limb == 1) cy = mpihelp_add_n(prodp, prodp, up, size); } else cy = mpihelp_addmul_1(prodp, up, size, v_limb); prodp[size] = cy; prodp++; } return cy; } static void mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size, mpi_ptr_t tspace) { if (size & 1) { /* The size is odd, and the code below doesn't handle that. * Multiply the least significant (size - 1) limbs with a recursive * call, and handle the most significant limb of S1 and S2 * separately. * A slightly faster way to do this would be to make the Karatsuba * code below behave as if the size were even, and let it check for * odd size in the end. I.e., in essence move this code to the end. * Doing so would save us a recursive call, and potentially make the * stack grow a lot less. */ mpi_size_t esize = size - 1; /* even size */ mpi_limb_t cy_limb; MPN_MUL_N_RECURSE(prodp, up, vp, esize, tspace); cy_limb = mpihelp_addmul_1(prodp + esize, up, esize, vp[esize]); prodp[esize + esize] = cy_limb; cy_limb = mpihelp_addmul_1(prodp + esize, vp, size, up[esize]); prodp[esize + size] = cy_limb; } else { /* Anatolij Alekseevich Karatsuba's divide-and-conquer algorithm. * * Split U in two pieces, U1 and U0, such that * U = U0 + U1*(B**n), * and V in V1 and V0, such that * V = V0 + V1*(B**n). * * UV is then computed recursively using the identity * * 2n n n n * UV = (B + B )U V + B (U -U )(V -V ) + (B + 1)U V * 1 1 1 0 0 1 0 0 * * Where B = 2**BITS_PER_MP_LIMB. */ mpi_size_t hsize = size >> 1; mpi_limb_t cy; int negflg; /* Product H. ________________ ________________ * |_____U1 x V1____||____U0 x V0_____| * Put result in upper part of PROD and pass low part of TSPACE * as new TSPACE. */ MPN_MUL_N_RECURSE(prodp + size, up + hsize, vp + hsize, hsize, tspace); /* Product M. ________________ * |_(U1-U0)(V0-V1)_| */ if (mpihelp_cmp(up + hsize, up, hsize) >= 0) { mpihelp_sub_n(prodp, up + hsize, up, hsize); negflg = 0; } else { mpihelp_sub_n(prodp, up, up + hsize, hsize); negflg = 1; } if (mpihelp_cmp(vp + hsize, vp, hsize) >= 0) { mpihelp_sub_n(prodp + hsize, vp + hsize, vp, hsize); negflg ^= 1; } else { mpihelp_sub_n(prodp + hsize, vp, vp + hsize, hsize); /* No change of NEGFLG. */ } /* Read temporary operands from low part of PROD. * Put result in low part of TSPACE using upper part of TSPACE * as new TSPACE. */ MPN_MUL_N_RECURSE(tspace, prodp, prodp + hsize, hsize, tspace + size); /* Add/copy product H. */ MPN_COPY(prodp + hsize, prodp + size, hsize); cy = mpihelp_add_n(prodp + size, prodp + size, prodp + size + hsize, hsize); /* Add product M (if NEGFLG M is a negative number) */ if (negflg) cy -= mpihelp_sub_n(prodp + hsize, prodp + hsize, tspace, size); else cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size); /* Product L. ________________ ________________ * |________________||____U0 x V0_____| * Read temporary operands from low part of PROD. * Put result in low part of TSPACE using upper part of TSPACE * as new TSPACE. */ MPN_MUL_N_RECURSE(tspace, up, vp, hsize, tspace + size); /* Add/copy Product L (twice) */ cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size); if (cy) mpihelp_add_1(prodp + hsize + size, prodp + hsize + size, hsize, cy); MPN_COPY(prodp, tspace, hsize); cy = mpihelp_add_n(prodp + hsize, prodp + hsize, tspace + hsize, hsize); if (cy) mpihelp_add_1(prodp + size, prodp + size, size, 1); } } void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size) { mpi_size_t i; mpi_limb_t cy_limb; mpi_limb_t v_limb; /* Multiply by the first limb in V separately, as the result can be * stored (not added) to PROD. We also avoid a loop for zeroing. */ v_limb = up[0]; if (v_limb <= 1) { if (v_limb == 1) MPN_COPY(prodp, up, size); else MPN_ZERO(prodp, size); cy_limb = 0; } else cy_limb = mpihelp_mul_1(prodp, up, size, v_limb); prodp[size] = cy_limb; prodp++; /* For each iteration in the outer loop, multiply one limb from * U with one limb from V, and add it to PROD. */ for (i = 1; i < size; i++) { v_limb = up[i]; if (v_limb <= 1) { cy_limb = 0; if (v_limb == 1) cy_limb = mpihelp_add_n(prodp, prodp, up, size); } else cy_limb = mpihelp_addmul_1(prodp, up, size, v_limb); prodp[size] = cy_limb; prodp++; } } void mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace) { if (size & 1) { /* The size is odd, and the code below doesn't handle that. * Multiply the least significant (size - 1) limbs with a recursive * call, and handle the most significant limb of S1 and S2 * separately. * A slightly faster way to do this would be to make the Karatsuba * code below behave as if the size were even, and let it check for * odd size in the end. I.e., in essence move this code to the end. * Doing so would save us a recursive call, and potentially make the * stack grow a lot less. */ mpi_size_t esize = size - 1; /* even size */ mpi_limb_t cy_limb; MPN_SQR_N_RECURSE(prodp, up, esize, tspace); cy_limb = mpihelp_addmul_1(prodp + esize, up, esize, up[esize]); prodp[esize + esize] = cy_limb; cy_limb = mpihelp_addmul_1(prodp + esize, up, size, up[esize]); prodp[esize + size] = cy_limb; } else { mpi_size_t hsize = size >> 1; mpi_limb_t cy; /* Product H. ________________ ________________ * |_____U1 x U1____||____U0 x U0_____| * Put result in upper part of PROD and pass low part of TSPACE * as new TSPACE. */ MPN_SQR_N_RECURSE(prodp + size, up + hsize, hsize, tspace); /* Product M. ________________ * |_(U1-U0)(U0-U1)_| */ if (mpihelp_cmp(up + hsize, up, hsize) >= 0) mpihelp_sub_n(prodp, up + hsize, up, hsize); else mpihelp_sub_n(prodp, up, up + hsize, hsize); /* Read temporary operands from low part of PROD. * Put result in low part of TSPACE using upper part of TSPACE * as new TSPACE. */ MPN_SQR_N_RECURSE(tspace, prodp, hsize, tspace + size); /* Add/copy product H */ MPN_COPY(prodp + hsize, prodp + size, hsize); cy = mpihelp_add_n(prodp + size, prodp + size, prodp + size + hsize, hsize); /* Add product M (if NEGFLG M is a negative number). */ cy -= mpihelp_sub_n(prodp + hsize, prodp + hsize, tspace, size); /* Product L. ________________ ________________ * |________________||____U0 x U0_____| * Read temporary operands from low part of PROD. * Put result in low part of TSPACE using upper part of TSPACE * as new TSPACE. */ MPN_SQR_N_RECURSE(tspace, up, hsize, tspace + size); /* Add/copy Product L (twice). */ cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size); if (cy) mpihelp_add_1(prodp + hsize + size, prodp + hsize + size, hsize, cy); MPN_COPY(prodp, tspace, hsize); cy = mpihelp_add_n(prodp + hsize, prodp + hsize, tspace + hsize, hsize); if (cy) mpihelp_add_1(prodp + size, prodp + size, size, 1); } } void mpihelp_mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size) { if (up == vp) { if (size < KARATSUBA_THRESHOLD) mpih_sqr_n_basecase(prodp, up, size); else { mpi_ptr_t tspace; tspace = mpi_alloc_limb_space(2 * size); mpih_sqr_n(prodp, up, size, tspace); mpi_free_limb_space(tspace); } } else { if (size < KARATSUBA_THRESHOLD) mul_n_basecase(prodp, up, vp, size); else { mpi_ptr_t tspace; tspace = mpi_alloc_limb_space(2 * size); mul_n(prodp, up, vp, size, tspace); mpi_free_limb_space(tspace); } } } int mpihelp_mul_karatsuba_case(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, mpi_ptr_t vp, mpi_size_t vsize, struct karatsuba_ctx *ctx) { mpi_limb_t cy; if (!ctx->tspace || ctx->tspace_size < vsize) { if (ctx->tspace) mpi_free_limb_space(ctx->tspace); ctx->tspace = mpi_alloc_limb_space(2 * vsize); if (!ctx->tspace) return -ENOMEM; ctx->tspace_size = vsize; } MPN_MUL_N_RECURSE(prodp, up, vp, vsize, ctx->tspace); prodp += vsize; up += vsize; usize -= vsize; if (usize >= vsize) { if (!ctx->tp || ctx->tp_size < vsize) { if (ctx->tp) mpi_free_limb_space(ctx->tp); ctx->tp = mpi_alloc_limb_space(2 * vsize); if (!ctx->tp) { if (ctx->tspace) mpi_free_limb_space(ctx->tspace); ctx->tspace = NULL; return -ENOMEM; } ctx->tp_size = vsize; } do { MPN_MUL_N_RECURSE(ctx->tp, up, vp, vsize, ctx->tspace); cy = mpihelp_add_n(prodp, prodp, ctx->tp, vsize); mpihelp_add_1(prodp + vsize, ctx->tp + vsize, vsize, cy); prodp += vsize; up += vsize; usize -= vsize; } while (usize >= vsize); } if (usize) { if (usize < KARATSUBA_THRESHOLD) { mpi_limb_t tmp; if (mpihelp_mul(ctx->tspace, vp, vsize, up, usize, &tmp) < 0) return -ENOMEM; } else { if (!ctx->next) { ctx->next = kzalloc(sizeof *ctx, GFP_KERNEL); if (!ctx->next) return -ENOMEM; } if (mpihelp_mul_karatsuba_case(ctx->tspace, vp, vsize, up, usize, ctx->next) < 0) return -ENOMEM; } cy = mpihelp_add_n(prodp, prodp, ctx->tspace, vsize); mpihelp_add_1(prodp + vsize, ctx->tspace + vsize, usize, cy); } return 0; } void mpihelp_release_karatsuba_ctx(struct karatsuba_ctx *ctx) { struct karatsuba_ctx *ctx2; if (ctx->tp) mpi_free_limb_space(ctx->tp); if (ctx->tspace) mpi_free_limb_space(ctx->tspace); for (ctx = ctx->next; ctx; ctx = ctx2) { ctx2 = ctx->next; if (ctx->tp) mpi_free_limb_space(ctx->tp); if (ctx->tspace) mpi_free_limb_space(ctx->tspace); kfree(ctx); } } /* Multiply the natural numbers u (pointed to by UP, with USIZE limbs) * and v (pointed to by VP, with VSIZE limbs), and store the result at * PRODP. USIZE + VSIZE limbs are always stored, but if the input * operands are normalized. Return the most significant limb of the * result. * * NOTE: The space pointed to by PRODP is overwritten before finished * with U and V, so overlap is an error. * * Argument constraints: * 1. USIZE >= VSIZE. * 2. PRODP != UP and PRODP != VP, i.e. the destination * must be distinct from the multiplier and the multiplicand. */ int mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result) { mpi_ptr_t prod_endp = prodp + usize + vsize - 1; mpi_limb_t cy; struct karatsuba_ctx ctx; if (vsize < KARATSUBA_THRESHOLD) { mpi_size_t i; mpi_limb_t v_limb; if (!vsize) { *_result = 0; return 0; } /* Multiply by the first limb in V separately, as the result can be * stored (not added) to PROD. We also avoid a loop for zeroing. */ v_limb = vp[0]; if (v_limb <= 1) { if (v_limb == 1) MPN_COPY(prodp, up, usize); else MPN_ZERO(prodp, usize); cy = 0; } else cy = mpihelp_mul_1(prodp, up, usize, v_limb); prodp[usize] = cy; prodp++; /* For each iteration in the outer loop, multiply one limb from * U with one limb from V, and add it to PROD. */ for (i = 1; i < vsize; i++) { v_limb = vp[i]; if (v_limb <= 1) { cy = 0; if (v_limb == 1) cy = mpihelp_add_n(prodp, prodp, up, usize); } else cy = mpihelp_addmul_1(prodp, up, usize, v_limb); prodp[usize] = cy; prodp++; } *_result = cy; return 0; } memset(&ctx, 0, sizeof ctx); if (mpihelp_mul_karatsuba_case(prodp, up, usize, vp, vsize, &ctx) < 0) return -ENOMEM; mpihelp_release_karatsuba_ctx(&ctx); *_result = *prod_endp; return 0; }
28 28 3 6 25 26 1 1 2 1 2 1 1 1 1 1 1 6 1 1 1 1 1 2 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> #include "power.h" #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, * they should always be called with system_transition_mutex held * (gfp_allowed_mask also should only be modified with system_transition_mutex * held, unless the suspend/hibernate code is guaranteed not to run in parallel * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; } } void pm_restrict_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } unsigned int lock_system_sleep(void) { unsigned int flags = current->flags; current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { ktime_t start; long elapsed_msecs; start = ktime_get(); ksys_sync(); elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); pr_info("Filesystems sync: %ld.%03ld seconds\n", elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); int register_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(register_pm_notifier); int unregister_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(unregister_pm_notifier); void pm_report_hw_sleep_time(u64 t) { suspend_stats.last_hw_sleep = t; suspend_stats.total_hw_sleep += t; } EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); void pm_report_max_hw_sleep(u64 t) { suspend_stats.max_hw_sleep = t; } EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } int pm_notifier_call_chain(unsigned long val) { return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_async_enabled = val; return n; } power_attr(pm_async); #ifdef CONFIG_SUSPEND static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { char *s = buf; suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { if (i >= PM_SUSPEND_MEM && cxl_mem_active()) continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) s += sprintf(s, "[%s] ", label); else s += sprintf(s, "%s ", label); } } /* Convert the last space to a newline if needed. */ if (s != buf) *(s-1) = '\n'; return (s - buf); } static suspend_state_t decode_suspend_state(const char *buf, size_t n) { suspend_state_t state; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = mem_sleep_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } return PM_SUSPEND_ON; } static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_suspend_state(buf, n); if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) mem_sleep_current = state; else error = -EINVAL; out: pm_autosleep_unlock(); return error ? error : n; } power_attr(mem_sleep); /* * sync_on_suspend: invoke ksys_sync_helper() before suspend. * * show() returns whether ksys_sync_helper() is invoked before suspend. * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; sync_on_suspend_enabled = !!val; return n; } power_attr(sync_on_suspend); #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_PM_SLEEP_DEBUG int pm_test_level = TEST_NONE; static const char * const pm_tests[__TEST_AFTER_LAST] = { [TEST_NONE] = "none", [TEST_CORE] = "core", [TEST_CPUS] = "processors", [TEST_PLATFORM] = "platform", [TEST_DEVICES] = "devices", [TEST_FREEZER] = "freezer", }; static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { char *s = buf; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) s += sprintf(s, "[%s] ", pm_tests[level]); else s += sprintf(s, "%s ", pm_tests[level]); } if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; return (s - buf); } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int sleep_flags; const char * const *s; int error = -EINVAL; int level; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { pm_test_level = level; error = 0; break; } unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ static char *suspend_step_name(enum suspend_stat_step step) { switch (step) { case SUSPEND_FREEZE: return "freeze"; case SUSPEND_PREPARE: return "prepare"; case SUSPEND_SUSPEND: return "suspend"; case SUSPEND_SUSPEND_NOIRQ: return "suspend_noirq"; case SUSPEND_RESUME_NOIRQ: return "resume_noirq"; case SUSPEND_RESUME: return "resume"; default: return ""; } } #define suspend_attr(_name, format_str) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sprintf(buf, format_str, suspend_stats._name); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_attr(success, "%d\n"); suspend_attr(fail, "%d\n"); suspend_attr(failed_freeze, "%d\n"); suspend_attr(failed_prepare, "%d\n"); suspend_attr(failed_suspend, "%d\n"); suspend_attr(failed_suspend_late, "%d\n"); suspend_attr(failed_suspend_noirq, "%d\n"); suspend_attr(failed_resume, "%d\n"); suspend_attr(failed_resume_early, "%d\n"); suspend_attr(failed_resume_noirq, "%d\n"); suspend_attr(last_hw_sleep, "%llu\n"); suspend_attr(total_hw_sleep, "%llu\n"); suspend_attr(max_hw_sleep, "%llu\n"); static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; char *last_failed_dev = NULL; index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; return sprintf(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); static ssize_t last_failed_errno_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; int last_failed_errno; index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; return sprintf(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); static ssize_t last_failed_step_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; enum suspend_stat_step step; char *last_failed_step = NULL; index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; last_failed_step = suspend_step_name(step); return sprintf(buf, "%s\n", last_failed_step); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); static struct attribute *suspend_attrs[] = { &success.attr, &fail.attr, &failed_freeze.attr, &failed_prepare.attr, &failed_suspend.attr, &failed_suspend_late.attr, &failed_suspend_noirq.attr, &failed_resume.attr, &failed_resume_early.attr, &failed_resume_noirq.attr, &last_failed_dev.attr, &last_failed_errno.attr, &last_failed_step.attr, &last_hw_sleep.attr, &total_hw_sleep.attr, &max_hw_sleep.attr, NULL, }; static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { if (attr != &last_hw_sleep.attr && attr != &total_hw_sleep.attr && attr != &max_hw_sleep.attr) return 0444; #ifdef CONFIG_ACPI if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) return 0444; #endif return 0; } static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, .is_visible = suspend_attr_is_visible, }; #ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, "failed_suspend_late", suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, "failed_resume_early", suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", suspend_step_name( suspend_stats.failed_steps[last_step])); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_step_name( suspend_stats.failed_steps[index])); } return 0; } DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, NULL, NULL, &suspend_stats_fops); return 0; } late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG /* * pm_print_times: print time taken by devices to suspend and resume. * * show() returns whether printing of suspend and resume times is enabled. * store() accepts 0 or 1. 0 disables printing and 1 enables it. */ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_print_times_enabled = !!val; return n; } power_attr(pm_print_times); static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (!pm_wakeup_irq()) return -ENODATA; return sprintf(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_debug_messages_on = !!val; return n; } power_attr(pm_debug_messages); static int __init pm_debug_messages_setup(char *str) { pm_debug_messages_on = true; return 1; } __setup("pm_debug_messages", pm_debug_messages_setup); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ struct kobject *power_kobj; /* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", * "freeze" and "disk" (hibernation). * See Documentation/admin-guide/pm/sleep-states.rst for a description of * what they mean. * * store() accepts one of those strings, translates it into the proper * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { char *s = buf; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) s += sprintf(s,"%s ", pm_states[i]); #endif if (hibernation_available()) s += sprintf(s, "disk "); if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; return (s - buf); } static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state; #endif char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; /* Check hibernation first. */ if (len == 4 && str_has_prefix(buf, "disk")) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = pm_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } #endif return PM_SUSPEND_ON; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_state(buf, n); if (state < PM_SUSPEND_MAX) { if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_suspend(state); } else if (state == PM_SUSPEND_MAX) { error = hibernate(); } else { error = -EINVAL; } out: pm_autosleep_unlock(); return error ? error : n; } power_attr(state); #ifdef CONFIG_PM_SLEEP /* * The 'wakeup_count' attribute, along with the functions defined in * drivers/base/power/wakeup.c, provides a means by which wakeup events can be * handled in a non-racy way. * * If a wakeup event occurs when the system is in a sleep state, it simply is * woken up. In turn, if an event that would wake the system up from a sleep * state occurs when it is undergoing a transition to that sleep state, the * transition should be aborted. Moreover, if such an event occurs when the * system is in the working state, an attempt to start a transition to the * given sleep state should fail during certain period after the detection of * the event. Using the 'state' attribute alone is not sufficient to satisfy * these requirements, because a wakeup event may occur exactly when 'state' * is being written to and may be delivered to user space right before it is * frozen, so the event will remain only partially processed until the system is * woken up by another event. In particular, it won't cause the transition to * a sleep state to be aborted. * * This difficulty may be overcome if user space uses 'wakeup_count' before * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. */ static ssize_t wakeup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int val; return pm_get_wakeup_count(&val, true) ? sprintf(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int val; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) error = n; else pm_print_active_wakeup_sources(); } out: pm_autosleep_unlock(); return error; } power_attr(wakeup_count); #ifdef CONFIG_PM_AUTOSLEEP static ssize_t autosleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) return sprintf(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) return sprintf(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); #else return sprintf(buf, "error"); #endif } static ssize_t autosleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state = decode_state(buf, n); int error; if (state == PM_SUSPEND_ON && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_autosleep_set_state(state); return error ? error : n; } power_attr(autosleep); #endif /* CONFIG_PM_AUTOSLEEP */ #ifdef CONFIG_PM_WAKELOCKS static ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, true); } static ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_lock(buf); return error ? error : n; } power_attr(wake_lock); static ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, false); } static ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_unlock(buf); return error ? error : n; } power_attr(wake_unlock); #endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_trace_enabled); } static ssize_t pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int val; if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; if (pm_trace_enabled) { pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" "PM: Correct system time has to be restored manually after resume.\n"); } return n; } return -EINVAL; } power_attr(pm_trace); static ssize_t pm_trace_dev_match_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return show_trace_dev_match(buf, PAGE_SIZE); } power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ #ifdef CONFIG_FREEZER static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; freeze_timeout_msecs = val; return n; } power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, #ifdef CONFIG_SUSPEND &mem_sleep_attr.attr, &sync_on_suspend_attr.attr, #endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif #ifdef CONFIG_PM_WAKELOCKS &wake_lock_attr.attr, &wake_unlock_attr.attr, #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_test_attr.attr, &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif NULL, }; static const struct attribute_group attr_group = { .attrs = g, }; static const struct attribute_group *attr_groups[] = { &attr_group, #ifdef CONFIG_PM_SLEEP &suspend_attr_group, #endif NULL, }; struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } static int __init pm_init(void) { int error = pm_start_workqueue(); if (error) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; error = sysfs_create_groups(power_kobj, attr_groups); if (error) return error; pm_print_times_init(); return pm_autosleep_init(); } core_initcall(pm_init);
42 42 42 42 42 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include "internals.h" bool irq_pm_check_wakeup(struct irq_desc *desc) { if (irqd_is_wakeup_armed(&desc->irq_data)) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); pm_system_irq_wakeup(irq_desc_get_irq(desc)); return true; } return false; } /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. */ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions++; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth++; WARN_ON_ONCE(desc->force_resume_depth && desc->force_resume_depth != desc->nr_actions); if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* * Called from __free_irq() with desc->lock held after @action has * been removed from the action chain. */ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions--; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth--; if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc) { unsigned long chipflags = irq_desc_get_chip(desc)->flags; struct irq_data *irqd = &desc->irq_data; if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(irqd)) { irqd_set(irqd, IRQD_WAKEUP_ARMED); if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && irqd_irq_disabled(irqd)) { /* * Interrupt marked for wakeup is in disabled state. * Enable interrupt here to unmask/enable in irqchip * to be able to resume with such interrupts. */ __enable_irq(desc); irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the * IRQD_WAKEUP_ARMED is visible before we return from * suspend_device_irqs(). */ return true; } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility * requires that the non wakeup interrupts are masked at the * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } /** * suspend_device_irqs - disable all currently enabled interrupt lines * * During system-wide suspend or hibernation device drivers need to be * prevented from receiving interrupts and this function is provided * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND * set and those which are marked as active wakeup sources. * * The active wakeup sources are handled by the flow handler entry * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool sync; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); sync = suspend_device_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); if (sync) synchronize_irq(irq); } } static void resume_irq(struct irq_desc *desc) { struct irq_data *irqd = &desc->irq_data; irqd_clear(irqd, IRQD_WAKEUP_ARMED); if (irqd_is_enabled_on_suspend(irqd)) { /* * Interrupt marked for wakeup was enabled during suspend * entry. Disable such interrupts to restore them back to * original state. */ __disable_irq(desc); irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } if (desc->istate & IRQS_SUSPENDED) goto resume; /* Force resume the interrupt? */ if (!desc->force_resume_depth) return; /* Pretend that it got disabled ! */ desc->depth++; irq_state_set_disabled(desc); irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); } static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); resume_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } } /** * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup * @irq: Interrupt to rearm */ void rearm_wake_irq(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) goto unlock; desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); unlock: irq_put_desc_busunlock(desc, flags); } /** * irq_pm_syscore_resume - enable interrupt lines early * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ static void irq_pm_syscore_resume(void) { resume_irqs(true); } static struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; static int __init irq_pm_init_ops(void) { register_syscore_ops(&irq_pm_syscore_ops); return 0; } device_initcall(irq_pm_init_ops); /** * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag * set as well as those with %IRQF_FORCE_RESUME. */ void resume_device_irqs(void) { resume_irqs(false); }
14 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H #include <linux/bpf.h> #include <linux/bpf-cgroup-defs.h> #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/percpu.h> #include <linux/rbtree.h> #include <net/sock.h> #include <uapi/linux/bpf.h> struct sock; struct sockaddr; struct cgroup; struct sk_buff; struct bpf_map; struct bpf_prog; struct bpf_sock_ops_kern; struct bpf_cgroup_storage; struct ctl_table; struct ctl_table_header; struct task_struct; unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, const struct bpf_insn *insn); unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, const struct bpf_insn *insn); unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, const struct bpf_insn *insn); #ifdef CONFIG_CGROUP_BPF #define CGROUP_ATYPE(type) \ case BPF_##type: return type static inline enum cgroup_bpf_attach_type to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) { switch (attach_type) { CGROUP_ATYPE(CGROUP_INET_INGRESS); CGROUP_ATYPE(CGROUP_INET_EGRESS); CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); CGROUP_ATYPE(CGROUP_SOCK_OPS); CGROUP_ATYPE(CGROUP_DEVICE); CGROUP_ATYPE(CGROUP_INET4_BIND); CGROUP_ATYPE(CGROUP_INET6_BIND); CGROUP_ATYPE(CGROUP_INET4_CONNECT); CGROUP_ATYPE(CGROUP_INET6_CONNECT); CGROUP_ATYPE(CGROUP_UNIX_CONNECT); CGROUP_ATYPE(CGROUP_INET4_POST_BIND); CGROUP_ATYPE(CGROUP_INET6_POST_BIND); CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); CGROUP_ATYPE(CGROUP_UNIX_SENDMSG); CGROUP_ATYPE(CGROUP_SYSCTL); CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); CGROUP_ATYPE(CGROUP_UNIX_RECVMSG); CGROUP_ATYPE(CGROUP_GETSOCKOPT); CGROUP_ATYPE(CGROUP_SETSOCKOPT); CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); default: return CGROUP_BPF_ATTACH_TYPE_INVALID; } } #undef CGROUP_ATYPE extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; #define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) struct bpf_cgroup_storage_map; struct bpf_storage_buffer { struct rcu_head rcu; char data[]; }; struct bpf_cgroup_storage { union { struct bpf_storage_buffer *buf; void __percpu *percpu_buf; }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; struct list_head list_map; struct list_head list_cg; struct rb_node node; struct rcu_head rcu; }; struct bpf_cgroup_link { struct bpf_link link; struct cgroup *cgroup; enum bpf_attach_type type; }; struct bpf_prog_list { struct hlist_node node; struct bpf_prog *prog; struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, sockptr_t optval, int *optlen, char **kernel_optval); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen, int max_optlen, int retval); int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, int optname, void *optval, int *optlen, int retval); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return BPF_CGROUP_STORAGE_PERCPU; return BPF_CGROUP_STORAGE_SHARED; } struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); /* Opportunistic check to see whether we have any BPF program attached*/ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, enum cgroup_bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); struct bpf_prog_array *array; array = rcu_access_pointer(cgrp->bpf.effective[type]); return array != &bpf_empty_prog_array.hdr; } /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \ cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ atype, NULL, NULL); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ atype, t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ }) /* BPF_CGROUP_INET4_BIND and BPF_CGROUP_INET6_BIND can return extra flags * via upper bits of return code. The only flag that is supported * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). */ #define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ atype, NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ } \ __ret; \ }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by * sk_to_full_sk(). * * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. * Its listener-sk is not attached to the rsk_listener. * In this case, the caller holds the listener-sk (unlocked), * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with * the listener-sk such that the cgroup-bpf-progs of the * listener-sk will be run. * * Regardless of syncookie mode or not, * calling bpf_setsockopt on listener-sk will not make sense anyway, * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. */ #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ CGROUP_SOCK_OPS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ CGROUP_DEVICE); \ \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ CGROUP_SYSCTL); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) && \ cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ kernel_optval); \ __ret; \ }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ copy_from_sockptr(&__ret, optlen, sizeof(int)); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ max_optlen, retval) \ ({ \ int __ret = retval; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) && \ cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ level, optname)) \ __ret = __cgroup_bpf_run_filter_getsockopt( \ sock, level, optname, optval, optlen, \ max_optlen, retval); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ optlen, retval) \ ({ \ int __ret = retval; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ }) int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); const struct bpf_func_proto * cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { return -EINVAL; } static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EINVAL; } static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline const struct bpf_func_proto * cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value) { return 0; } static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags) { return 0; } #define cgroup_bpf_enabled(atype) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) #endif /* CONFIG_CGROUP_BPF */ #endif /* _BPF_CGROUP_H */
66 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0-only /* Miscellaneous routines. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/swap.h> #include "internal.h" /* * Attach a folio to the buffer and maybe set marks on it to say that we need * to put the folio later and twiddle the pagecache flags. */ int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, struct folio *folio, unsigned int flags, gfp_t gfp_mask) { XA_STATE_ORDER(xas, xa, index, folio_order(folio)); retry: xas_lock(&xas); for (;;) { xas_store(&xas, folio); if (!xas_error(&xas)) break; xas_unlock(&xas); if (!xas_nomem(&xas, gfp_mask)) return xas_error(&xas); goto retry; } if (flags & NETFS_FLAG_PUT_MARK) xas_set_mark(&xas, NETFS_BUF_PUT_MARK); if (flags & NETFS_FLAG_PAGECACHE_MARK) xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); xas_unlock(&xas); return xas_error(&xas); } /* * Create the specified range of folios in the buffer attached to the read * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that * these need freeing later. */ int netfs_add_folios_to_buffer(struct xarray *buffer, struct address_space *mapping, pgoff_t index, pgoff_t to, gfp_t gfp_mask) { struct folio *folio; int ret; if (to + 1 == index) /* Page range is inclusive */ return 0; do { /* TODO: Figure out what order folio can be allocated here */ folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); if (!folio) return -ENOMEM; folio->index = index; ret = netfs_xa_store_and_mark(buffer, index, folio, NETFS_FLAG_PUT_MARK, gfp_mask); if (ret < 0) { folio_put(folio); return ret; } index += folio_nr_pages(folio); } while (index <= to && index != 0); return 0; } /* * Clear an xarray buffer, putting a ref on the folios that have * NETFS_BUF_PUT_MARK set. */ void netfs_clear_buffer(struct xarray *buffer) { struct folio *folio; XA_STATE(xas, buffer, 0); rcu_read_lock(); xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { folio_put(folio); } rcu_read_unlock(); xa_destroy(buffer); } /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. * @folio: The folio being dirtied. * * Set the dirty flag on a folio and pin an in-use cache object in memory so * that writeback can later write to it. This is intended to be called from * the filesystem's ->dirty_folio() method. * * Return: true if the dirty flag was set on the folio, false otherwise. */ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct netfs_inode *ictx = netfs_inode(inode); struct fscache_cookie *cookie = netfs_i_cookie(ictx); bool need_use = false; _enter(""); if (!filemap_dirty_folio(mapping, folio)) return false; if (!fscache_cookie_valid(cookie)) return true; if (!(inode->i_state & I_PINNING_NETFS_WB)) { spin_lock(&inode->i_lock); if (!(inode->i_state & I_PINNING_NETFS_WB)) { inode->i_state |= I_PINNING_NETFS_WB; need_use = true; } spin_unlock(&inode->i_lock); if (need_use) fscache_use_cookie(cookie, true); } return true; } EXPORT_SYMBOL(netfs_dirty_folio); /** * netfs_unpin_writeback - Unpin writeback resources * @inode: The inode on which the cookie resides * @wbc: The writeback control * * Unpin the writeback resources pinned by netfs_dirty_folio(). This is * intended to be called as/by the netfs's ->write_inode() method. */ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc) { struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); if (wbc->unpinned_netfs_wb) fscache_unuse_cookie(cookie, NULL, NULL); return 0; } EXPORT_SYMBOL(netfs_unpin_writeback); /** * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode * @inode: The inode to clean up * @aux: Auxiliary data to apply to the inode * * Clear any writeback resources held by an inode when the inode is evicted. * This must be called before clear_inode() is called. */ void netfs_clear_inode_writeback(struct inode *inode, const void *aux) { struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); if (inode->i_state & I_PINNING_NETFS_WB) { loff_t i_size = i_size_read(inode); fscache_unuse_cookie(cookie, aux, &i_size); } } EXPORT_SYMBOL(netfs_clear_inode_writeback); /** * netfs_invalidate_folio - Invalidate or partially invalidate a folio * @folio: Folio proposed for release * @offset: Offset of the invalidated region * @length: Length of the invalidated region * * Invalidate part or all of a folio for a network filesystem. The folio will * be removed afterwards if the invalidated region covers the entire folio. */ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo = NULL; size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio_index(folio), offset, length); folio_wait_fscache(folio); if (!folio_test_private(folio)) return; finfo = netfs_folio_info(folio); if (offset == 0 && length >= flen) goto erase_completely; if (finfo) { /* We have a partially uptodate page from a streaming write. */ unsigned int fstart = finfo->dirty_offset; unsigned int fend = fstart + finfo->dirty_len; unsigned int end = offset + length; if (offset >= fend) return; if (end <= fstart) return; if (offset <= fstart && end >= fend) goto erase_completely; if (offset <= fstart && end > fstart) goto reduce_len; if (offset > fstart && end >= fend) goto move_start; /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ } return; erase_completely: netfs_put_group(netfs_folio_group(folio)); folio_detach_private(folio); folio_clear_uptodate(folio); kfree(finfo); return; reduce_len: finfo->dirty_len = offset + length - finfo->dirty_offset; return; move_start: finfo->dirty_len -= offset - finfo->dirty_offset; finfo->dirty_offset = offset; } EXPORT_SYMBOL(netfs_invalidate_folio); /** * netfs_release_folio - Try to release a folio * @folio: Folio proposed for release * @gfp: Flags qualifying the release * * Request release of a folio and clean up its private state if it's not busy. * Returns true if the folio can now be released, false if not */ bool netfs_release_folio(struct folio *folio, gfp_t gfp) { struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); unsigned long long end; end = folio_pos(folio) + folio_size(folio); if (end > ctx->zero_point) ctx->zero_point = end; if (folio_test_private(folio)) return false; if (folio_test_fscache(folio)) { if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; folio_wait_fscache(folio); } fscache_note_page_release(netfs_i_cookie(ctx)); return true; } EXPORT_SYMBOL(netfs_release_folio);
5 6 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __NET_SCHED_PIE_H #define __NET_SCHED_PIE_H #include <linux/ktime.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/inet_ecn.h> #include <net/pkt_sched.h> #define MAX_PROB (U64_MAX >> BITS_PER_BYTE) #define DTIME_INVALID U64_MAX #define QUEUE_THRESHOLD 16384 #define DQCOUNT_INVALID -1 #define PIE_SCALE 8 /** * struct pie_params - contains pie parameters * @target: target delay in pschedtime * @tupdate: interval at which drop probability is calculated * @limit: total number of packets that can be in the queue * @alpha: parameter to control drop probability * @beta: parameter to control drop probability * @ecn: is ECN marking of packets enabled * @bytemode: is drop probability scaled based on pkt size * @dq_rate_estimator: is Little's law used for qdelay calculation */ struct pie_params { psched_time_t target; u32 tupdate; u32 limit; u32 alpha; u32 beta; u8 ecn; u8 bytemode; u8 dq_rate_estimator; }; /** * struct pie_vars - contains pie variables * @qdelay: current queue delay * @qdelay_old: queue delay in previous qdelay calculation * @burst_time: burst time allowance * @dq_tstamp: timestamp at which dq rate was last calculated * @prob: drop probability * @accu_prob: accumulated drop probability * @dq_count: number of bytes dequeued in a measurement cycle * @avg_dq_rate: calculated average dq rate * @backlog_old: queue backlog during previous qdelay calculation */ struct pie_vars { psched_time_t qdelay; psched_time_t qdelay_old; psched_time_t burst_time; psched_time_t dq_tstamp; u64 prob; u64 accu_prob; u64 dq_count; u32 avg_dq_rate; u32 backlog_old; }; /** * struct pie_stats - contains pie stats * @packets_in: total number of packets enqueued * @dropped: packets dropped due to pie action * @overlimit: packets dropped due to lack of space in queue * @ecn_mark: packets marked with ECN * @maxq: maximum queue size */ struct pie_stats { u32 packets_in; u32 dropped; u32 overlimit; u32 ecn_mark; u32 maxq; }; /** * struct pie_skb_cb - contains private skb vars * @enqueue_time: timestamp when the packet is enqueued * @mem_usage: size of the skb during enqueue */ struct pie_skb_cb { psched_time_t enqueue_time; u32 mem_usage; }; static inline void pie_params_init(struct pie_params *params) { params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ params->limit = 1000; params->alpha = 2; params->beta = 20; params->ecn = false; params->bytemode = false; params->dq_rate_estimator = false; } static inline void pie_vars_init(struct pie_vars *vars) { vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); /* 150 ms */ vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; vars->dq_count = DQCOUNT_INVALID; vars->avg_dq_rate = 0; } static inline struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; } static inline psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) { return get_pie_cb(skb)->enqueue_time; } static inline void pie_set_enqueue_time(struct sk_buff *skb) { get_pie_cb(skb)->enqueue_time = psched_get_time(); } bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, struct pie_vars *vars, u32 backlog, u32 packet_size); void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, struct pie_vars *vars, u32 backlog); void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, u32 backlog); #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MLD_H #define LINUX_MLD_H #include <linux/in6.h> #include <linux/icmpv6.h> /* MLDv1 Query/Report/Done */ struct mld_msg { struct icmp6hdr mld_hdr; struct in6_addr mld_mca; }; #define mld_type mld_hdr.icmp6_type #define mld_code mld_hdr.icmp6_code #define mld_cksum mld_hdr.icmp6_cksum #define mld_maxdelay mld_hdr.icmp6_maxdelay #define mld_reserved mld_hdr.icmp6_dataun.un_data16[1] /* Multicast Listener Discovery version 2 headers */ /* MLDv2 Report */ struct mld2_grec { __u8 grec_type; __u8 grec_auxwords; __be16 grec_nsrcs; struct in6_addr grec_mca; struct in6_addr grec_src[]; }; struct mld2_report { struct icmp6hdr mld2r_hdr; struct mld2_grec mld2r_grec[]; }; #define mld2r_type mld2r_hdr.icmp6_type #define mld2r_resv1 mld2r_hdr.icmp6_code #define mld2r_cksum mld2r_hdr.icmp6_cksum #define mld2r_resv2 mld2r_hdr.icmp6_dataun.un_data16[0] #define mld2r_ngrec mld2r_hdr.icmp6_dataun.un_data16[1] /* MLDv2 Query */ struct mld2_query { struct icmp6hdr mld2q_hdr; struct in6_addr mld2q_mca; #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 mld2q_qrv:3, mld2q_suppress:1, mld2q_resv2:4; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 mld2q_resv2:4, mld2q_suppress:1, mld2q_qrv:3; #else #error "Please fix <asm/byteorder.h>" #endif __u8 mld2q_qqic; __be16 mld2q_nsrcs; struct in6_addr mld2q_srcs[]; }; #define mld2q_type mld2q_hdr.icmp6_type #define mld2q_code mld2q_hdr.icmp6_code #define mld2q_cksum mld2q_hdr.icmp6_cksum #define mld2q_mrc mld2q_hdr.icmp6_maxdelay #define mld2q_resv1 mld2q_hdr.icmp6_dataun.un_data16[1] /* RFC3810, 5.1.3. Maximum Response Code: * * If Maximum Response Code >= 32768, Maximum Response Code represents a * floating-point value as follows: * * 0 1 2 3 4 5 6 7 8 9 A B C D E F * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define MLDV2_MRC_EXP(value) (((value) >> 12) & 0x0007) #define MLDV2_MRC_MAN(value) ((value) & 0x0fff) /* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code): * * If QQIC >= 128, QQIC represents a floating-point value as follows: * * 0 1 2 3 4 5 6 7 * +-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+ */ #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) #define MLD_EXP_MIN_LIMIT 32768UL #define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) #define MLD_MAX_QUEUE 8 #define MLD_MAX_SKBS 32 static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. Maximum Response Code */ unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); if (mc_mrc < MLD_EXP_MIN_LIMIT) { ret = mc_mrc; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_MRC_EXP(mc_mrc); mc_man = MLDV2_MRC_MAN(mc_mrc); ret = (mc_man | 0x1000) << (mc_exp + 3); } return ret; } #endif
22 22 29 1 2 26 1 24 1 24 1 23 4 23 1 28 2 1 1 1 22 22 22 22 22 22 22 22 22 22 17 2 3 2 1 2 2 2 2 20 20 22 1 1 22 21 22 22 22 21 22 22 22 22 7 8 7 12 22 22 22 21 20 20 1 20 1 22 22 22 22 22 22 22 21 21 22 21 22 21 21 22 20 1 3 19 21 1 1 21 20 22 20 21 20 20 22 22 22 22 22 22 22 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 110 12 12 108 1244 1245 1245 82 4 13 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 // SPDX-License-Identifier: GPL-2.0-only /* * VXLAN: Virtual eXtensible Local Area Network * * Copyright (c) 2012-2013 Vyatta Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/if_ether.h> #include <linux/ethtool.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/gro.h> #include <net/ipv6_stubs.h> #include <net/ip.h> #include <net/icmp.h> #include <net/rtnetlink.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/tun_proto.h> #include <net/vxlan.h> #include <net/nexthop.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #endif #include "vxlan_private.h" #define VXLAN_VERSION "0.1" #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 * for compatibility with early adopters. */ static unsigned short vxlan_port __read_mostly = 8472; module_param_named(udp_port, vxlan_port, ushort, 0444); MODULE_PARM_DESC(udp_port, "Destination UDP port"); static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); unsigned int vxlan_net_id; const u8 all_zeros_mac[ETH_ALEN + 2]; static struct rtnl_link_ops vxlan_link_ops; static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); /* salt for hash table */ static u32 vxlan_salt __read_mostly; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { return vs->flags & VXLAN_F_COLLECT_METADATA || ip_tunnel_collect_metadata(); } /* Find VXLAN socket based on network namespace, address family, UDP port, * enabled unshareable flags and socket device binding (see l3mdev with * non-default VRF). */ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, __be16 port, u32 flags, int ifindex) { struct vxlan_sock *vs; flags &= VXLAN_F_RCV_FLAGS; hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && vxlan_get_sk_family(vs) == family && vs->flags == flags && vs->sock->sk->sk_bound_dev_if == ifindex) return vs; } return NULL; } static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex, __be32 vni, struct vxlan_vni_node **vninode) { struct vxlan_vni_node *vnode; struct vxlan_dev_node *node; /* For flow based devices, map all packets to VNI 0 */ if (vs->flags & VXLAN_F_COLLECT_METADATA && !(vs->flags & VXLAN_F_VNIFILTER)) vni = 0; hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) { if (!node->vxlan) continue; vnode = NULL; if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) { vnode = vxlan_vnifilter_lookup(node->vxlan, vni); if (!vnode) continue; } else if (node->vxlan->default_dst.remote_vni != vni) { continue; } if (IS_ENABLED(CONFIG_IPV6)) { const struct vxlan_config *cfg = &node->vxlan->cfg; if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) && cfg->remote_ifindex != ifindex) continue; } if (vninode) *vninode = vnode; return node->vxlan; } return NULL; } /* Look up VNI in a per net namespace table */ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex, __be32 vni, sa_family_t family, __be16 port, u32 flags) { struct vxlan_sock *vs; vs = vxlan_find_sock(net, family, port, flags, ifindex); if (!vs) return NULL; return vxlan_vs_find_vni(vs, ifindex, vni, NULL); } /* Fill in neighbour message in skbuff. */ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, u32 portid, u32 seq, int type, unsigned int flags, const struct vxlan_rdst *rdst) { unsigned long now = jiffies; struct nda_cacheinfo ci; bool send_ip, send_eth; struct nlmsghdr *nlh; struct nexthop *nh; struct ndmsg *ndm; int nh_family; u32 nh_id; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); memset(ndm, 0, sizeof(*ndm)); send_eth = send_ip = true; rcu_read_lock(); nh = rcu_dereference(fdb->nh); if (nh) { nh_family = nexthop_get_family(nh); nh_id = nh->id; } rcu_read_unlock(); if (type == RTM_GETNEIGH) { if (rdst) { send_ip = !vxlan_addr_any(&rdst->remote_ip); ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; } else if (nh) { ndm->ndm_family = nh_family; } send_eth = !is_zero_ether_addr(fdb->eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; if (!net_eq(dev_net(vxlan->dev), vxlan->net) && nla_put_s32(skb, NDA_LINK_NETNSID, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) goto nla_put_failure; } else if (rdst) { if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) goto nla_put_failure; if (rdst->remote_ifindex && nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) goto nla_put_failure; } if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ + nla_total_size(sizeof(struct nda_cacheinfo)); } static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type) { struct net *net = dev_net(vxlan->dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); if (err < 0) { /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, const struct vxlan_rdst *rd, struct netlink_ext_ack *extack, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { fdb_info->info.dev = vxlan->dev; fdb_info->info.extack = extack; fdb_info->remote_ip = rd->remote_ip; fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); fdb_info->vni = fdb->vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, bool adding, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info info; enum switchdev_notifier_type notifier_type; int ret; if (WARN_ON(!rd)) return 0; notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE; vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info); ret = call_switchdev_notifiers(notifier_type, vxlan->dev, &info.info, extack); return notifier_to_errno(ret); } static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type, bool swdev_notify, struct netlink_ext_ack *extack) { int err; if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, true, extack); if (err) return err; break; case RTM_DELNEIGH: vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, false, extack); break; } } __vxlan_fdb_notify(vxlan, fdb, rd, type); return 0; } static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { .remote_ip = *ipa, /* goes to NDA_DST */ .remote_vni = cpu_to_be32(VXLAN_N_VID), }; vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) { struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { }; memcpy(f.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } /* Hash Ethernet address */ static u32 eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, FDB_HASH_BITS); } u32 eth_vni_hash(const unsigned char *addr, __be32 vni) { /* use 1 byte of OUI and 3 bytes of NIC */ u32 key = get_unaligned((u32 *)(addr + 2)); return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); } u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) return eth_vni_hash(mac, vni); else return eth_hash(mac); } /* Hash chain to use given mac address */ static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; } /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; hlist_for_each_entry_rcu(f, head, hlist) { if (ether_addr_equal(mac, f->eth_addr)) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { if (vni == f->vni) return f; } else { return f; } } } return NULL; } static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, vni); if (f && f->used != jiffies) f->used = jiffies; return f; } /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex) { struct vxlan_rdst *rd; list_for_each_entry(rd, &f->remotes, list) { if (vxlan_addr_equal(&rd->remote_ip, ip) && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) return rd; } return NULL; } int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); u8 eth_addr[ETH_ALEN + 2] = { 0 }; struct vxlan_rdst *rdst; struct vxlan_fdb *f; int rc = 0; if (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)) return -EINVAL; ether_addr_copy(eth_addr, mac); rcu_read_lock(); f = __vxlan_find_mac(vxlan, eth_addr, vni); if (!f) { rc = -ENOENT; goto out; } rdst = first_remote_rcu(f); vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info); out: rcu_read_unlock(); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc); static int vxlan_fdb_notify_one(struct notifier_block *nb, const struct vxlan_dev *vxlan, const struct vxlan_fdb *f, const struct vxlan_rdst *rdst, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info fdb_info; int rc; vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info); rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE, &fdb_info); return notifier_to_errno(rc); } int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; unsigned int h; int rc = 0; if (!netif_is_vxlan(dev)) return -EINVAL; vxlan = netdev_priv(dev); for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { if (f->vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, extack); if (rc) goto unlock; } } } spin_unlock_bh(&vxlan->hash_lock[h]); } return 0; unlock: spin_unlock_bh(&vxlan->hash_lock[h]); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; unsigned int h; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) if (f->vni == vni) list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; spin_unlock_bh(&vxlan->hash_lock[h]); } } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); /* Replace destination of unicast mac */ static int vxlan_fdb_replace(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst *oldrd) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); if (!rd) return 0; *oldrd = *rd; dst_cache_reset(&rd->dst_cache); rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; rd->offloaded = false; return 1; } /* Add/update destinations for multicast */ static int vxlan_fdb_append(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst **rdp) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOMEM; if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { kfree(rd); return -ENOMEM; } rd->remote_ip = *ip; rd->remote_port = port; rd->offloaded = false; rd->remote_vni = vni; rd->remote_ifindex = ifindex; list_add_tail_rcu(&rd->list, &f->remotes); *rdp = rd; return 1; } static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol) { struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. */ if (!gpe->np_applied) return false; /* "The initial version is 0. If a receiver does not support the * version indicated it MUST drop the packet. */ if (gpe->version != 0) return false; /* "When the O bit is set to 1, the packet is an OAM packet and OAM * processing MUST occur." However, we don't implement OAM * processing, thus drop the packet. */ if (gpe->oam_flag) return false; *protocol = tun_p_to_eth_p(gpe->next_protocol); if (!*protocol) return false; return true; } static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, unsigned int off, struct vxlanhdr *vh, size_t hdrlen, __be32 vni_field, struct gro_remcsum *grc, bool nopartial) { size_t start, offset; if (skb->remcsum_offload) return vh; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; start = vxlan_rco_start(vni_field); offset = start + vxlan_rco_offset(vni_field); vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, start, offset, grc, nopartial); skb->remcsum_offload = 1; return vh; } static struct vxlanhdr *vxlan_gro_prepare_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb, struct gro_remcsum *grc) { struct sk_buff *p; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk); __be32 flags; skb_gro_remcsum_init(grc); off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); vh = skb_gro_header(skb, hlen, off_vx); if (unlikely(!vh)) return NULL; skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); flags = vh->vx_flags; if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), vh->vx_vni, grc, !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); if (!vh) return NULL; } skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); if (vh->vx_flags != vh2->vx_flags || vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } return vh; } static struct sk_buff *vxlan_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct gro_remcsum grc; int flush = 1; if (vxlan_gro_prepare_receive(sk, head, skb, &grc)) { pp = call_gro_receive(eth_gro_receive, head, skb); flush = 0; } skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; struct sk_buff *pp = NULL; struct gro_remcsum grc; struct vxlanhdr *vh; __be16 protocol; int flush = 1; vh = vxlan_gro_prepare_receive(sk, head, skb, &grc); if (vh) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto out; ptype = gro_find_receive_by_type(protocol); if (!ptype) goto out; pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; } out: skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { /* Sets 'skb->inner_mac_header' since we are always called with * 'skb->encapsulation' set. */ return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } static int vxlan_gpe_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct vxlanhdr *vh = (struct vxlanhdr *)(skb->data + nhoff); const struct packet_offload *ptype; int err = -ENOSYS; __be16 protocol; if (!vxlan_parse_gpe_proto(vh, &protocol)) return err; ptype = gro_find_complete_by_type(protocol); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); return err; } static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, __u16 state, __be32 src_vni, __u16 ndm_flags) { struct vxlan_fdb *f; f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); return f; } static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, __be32 src_vni, struct vxlan_fdb *f) { ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac, src_vni)); } static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { struct nexthop *old_nh = rtnl_dereference(fdb->nh); struct nexthop *nh; int err = -EINVAL; if (old_nh && old_nh->id == nhid) return 0; nh = nexthop_find_by_id(vxlan->net, nhid); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); nh = NULL; goto err_inval; } if (!nexthop_is_fdb(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); goto err_inval; } if (!nexthop_is_multipath(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); goto err_inval; } /* check nexthop group family */ switch (vxlan->default_dst.remote_ip.sa.sa_family) { case AF_INET: if (!nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } break; case AF_INET6: if (nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } } if (old_nh) { list_del_rcu(&fdb->nh_list); nexthop_put(old_nh); } rcu_assign_pointer(fdb->nh, nh); list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); return 1; err_inval: if (nh) nexthop_put(nh); return err; } int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, struct vxlan_fdb **fdb, struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int rc; if (vxlan->cfg.addrmax && vxlan->addrcnt >= vxlan->cfg.addrmax) return -ENOSPC; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags); if (!f) return -ENOMEM; if (nhid) rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); else rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) goto errout; *fdb = f; return 0; errout: kfree(f); return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; struct nexthop *nh; nh = rcu_dereference_raw(f->nh); if (nh) { rcu_assign_pointer(f->nh, NULL); rcu_assign_pointer(f->vdev, NULL); nexthop_put(nh); } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); kfree(rd); } kfree(f); } static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); __vxlan_fdb_free(f); } static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, bool do_notify, bool swdev_notify) { struct vxlan_rdst *rd; netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; if (do_notify) { if (rcu_access_pointer(f->nh)) vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); else list_for_each_entry(rd, &f->remotes, list) vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); } hlist_del_rcu(&f->hlist); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } static void vxlan_dst_free(struct rcu_head *head) { struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); dst_cache_destroy(&rd->dst_cache); kfree(rd); } static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_rdst *rd = NULL; struct vxlan_rdst oldrd; int notify = 0; int rc = 0; int err; if (nhid && !rcu_access_pointer(f->nh)) { NL_SET_ERR_MSG(extack, "Cannot replace an existing non nexthop fdb with a nexthop"); return -EOPNOTSUPP; } if (nhid && (flags & NLM_F_APPEND)) { NL_SET_ERR_MSG(extack, "Cannot append to a nexthop fdb"); return -EOPNOTSUPP; } /* Do not allow an externally learned entry to take over an entry added * by the user. */ if (!(fdb_flags & NTF_EXT_LEARNED) || !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; f->updated = jiffies; notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; f->updated = jiffies; notify = 1; } } if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) return rc; } else { rc = vxlan_fdb_replace(f, ip, port, vni, ifindex, &oldrd); } notify |= rc; } else { NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } if ((flags & NLM_F_APPEND) && (is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) return rc; notify |= rc; } if (ndm_flags & NTF_USE) f->used = jiffies; if (notify) { if (rd == NULL) rd = first_remote_rtnl(f); err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify, extack); if (err) goto err_notify; } return 0; err_notify: if (nhid) return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { list_del_rcu(&rd->list); call_rcu(&rd->rcu, vxlan_dst_free); } return err; } static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_fdb *f; int rc; /* Disallow replace to add a multicast entry */ if ((flags & NLM_F_REPLACE) && (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) return -EOPNOTSUPP; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; vxlan_fdb_insert(vxlan, mac, src_vni, f); rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, swdev_notify, extack); if (rc) goto err_notify; return 0; err_notify: vxlan_fdb_destroy(vxlan, f, false, false); return rc; } /* Add new entry to forwarding table -- assumes lock held */ int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, "lost race to create %pM\n", mac); return -EEXIST; } return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, ndm_flags, nhid, swdev_notify, extack); } } static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, struct vxlan_rdst *rd, bool swdev_notify) { list_del_rcu(&rd->list); vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); call_rcu(&rd->rcu, vxlan_dst_free); } static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, __be32 *vni, u32 *ifindex, u32 *nhid, struct netlink_ext_ack *extack) { struct net *net = dev_net(vxlan->dev); int err; if (tb[NDA_NH_ID] && (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT])) { NL_SET_ERR_MSG(extack, "DST, VNI, ifindex and port are mutually exclusive with NH_ID"); return -EINVAL; } if (tb[NDA_DST]) { err = vxlan_nla_get_addr(ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG(extack, "Unsupported address family"); return err; } } else { union vxlan_addr *remote = &vxlan->default_dst.remote_ip; if (remote->sa.sa_family == AF_INET) { ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); ip->sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { ip->sin6.sin6_addr = in6addr_any; ip->sa.sa_family = AF_INET6; #endif } } if (tb[NDA_PORT]) { if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) { NL_SET_ERR_MSG(extack, "Invalid vxlan port"); return -EINVAL; } *port = nla_get_be16(tb[NDA_PORT]); } else { *port = vxlan->cfg.dst_port; } if (tb[NDA_VNI]) { if (nla_len(tb[NDA_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid vni"); return -EINVAL; } *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); } else { *vni = vxlan->default_dst.remote_vni; } if (tb[NDA_SRC_VNI]) { if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid src vni"); return -EINVAL; } *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); } else { *src_vni = vxlan->default_dst.remote_vni; } if (tb[NDA_IFINDEX]) { struct net_device *tdev; if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } *ifindex = nla_get_u32(tb[NDA_IFINDEX]); tdev = __dev_get_by_index(net, *ifindex); if (!tdev) { NL_SET_ERR_MSG(extack, "Device not found"); return -EADDRNOTAVAIL; } } else { *ifindex = 0; } if (tb[NDA_NH_ID]) *nhid = nla_get_u32(tb[NDA_NH_ID]); else *nhid = 0; return 0; } /* Add static entry (via netlink) */ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ union vxlan_addr ip; __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; u32 hash_index; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { pr_info("RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; hash_index = fdb_head_index(vxlan, addr, src_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } int __vxlan_fdb_delete(struct vxlan_dev *vxlan, const unsigned char *addr, union vxlan_addr ip, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; if (!vxlan_addr_any(&ip)) { rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); if (!rd) goto out; } /* remove a destination if it's not the only one on the list, * otherwise destroy the fdb entry */ if (rd && !list_is_singular(&f->remotes)) { vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify); goto out; } vxlan_fdb_destroy(vxlan, f, true, swdev_notify); out: return 0; } /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; u32 hash_index; __be16 port; int err; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; hash_index = fdb_head_index(vxlan, addr, src_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } /* Dump forwarding table */ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; int err = 0; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct vxlan_fdb *f; rcu_read_lock(); hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; if (rcu_access_pointer(f->nh)) { if (*idx < cb->args[2]) goto skip_nh; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, NULL); if (err < 0) { rcu_read_unlock(); goto out; } skip_nh: *idx += 1; continue; } list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < cb->args[2]) goto skip; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, rd); if (err < 0) { rcu_read_unlock(); goto out; } skip: *idx += 1; } } rcu_read_unlock(); } out: return err; } static int vxlan_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; __be32 vni; int err; if (tb[NDA_VNI]) vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); else vni = vxlan->default_dst.remote_vni; rcu_read_lock(); f = __vxlan_find_mac(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; goto errout; } err = vxlan_fdb_info(skb, vxlan, f, portid, seq, RTM_NEWNEIGH, 0, first_remote_rcu(f)); errout: rcu_read_unlock(); return err; } /* Watch incoming packets to learn mapping between Ethernet address * and Tunnel endpoint. * Return true if packet is bogus and should be dropped. */ static bool vxlan_snoop(struct net_device *dev, union vxlan_addr *src_ip, const u8 *src_mac, u32 src_ifindex, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 ifindex = 0; #if IS_ENABLED(CONFIG_IPV6) if (src_ip->sa.sa_family == AF_INET6 && (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)) ifindex = src_ifindex; #endif f = vxlan_find_mac(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) return false; /* Don't migrate static entries, drop packets */ if (f->state & (NUD_PERMANENT | NUD_NOARP)) return true; /* Don't override an fdb with nexthop with a learnt entry */ if (rcu_access_pointer(f->nh)) return true; if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; f->updated = jiffies; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { u32 hash_index = fdb_head_index(vxlan, src_mac, vni); /* learned new entry */ spin_lock(&vxlan->hash_lock[hash_index]); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) vxlan_fdb_update(vxlan, src_mac, src_ip, NUD_REACHABLE, NLM_F_EXCL|NLM_F_CREATE, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } return false; } static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) { struct vxlan_net *vn; if (!vs) return false; if (!refcount_dec_and_test(&vs->refcnt)) return false; vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); udp_tunnel_notify_del_rx_port(vs->sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); return true; } static void vxlan_sock_release(struct vxlan_dev *vxlan) { struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); RCU_INIT_POINTER(vxlan->vn6_sock, NULL); #endif RCU_INIT_POINTER(vxlan->vn4_sock, NULL); synchronize_net(); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vs_del_vnigrp(vxlan); else vxlan_vs_del_dev(vxlan); if (__vxlan_sock_release_prep(sock4)) { udp_tunnel_sock_release(sock4->sock); kfree(sock4); } #if IS_ENABLED(CONFIG_IPV6) if (__vxlan_sock_release_prep(sock6)) { udp_tunnel_sock_release(sock6->sock); kfree(sock6); } #endif } static bool vxlan_remcsum(struct vxlanhdr *unparsed, struct sk_buff *skb, u32 vxflags) { size_t start, offset; if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) goto out; start = vxlan_rco_start(unparsed->vx_vni); offset = start + vxlan_rco_offset(unparsed->vx_vni); if (!pskb_may_pull(skb, offset + sizeof(u16))) return false; skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); out: unparsed->vx_flags &= ~VXLAN_HF_RCO; unparsed->vx_vni &= VXLAN_VNI_MASK; return true; } static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; struct metadata_dst *tun_dst; if (!(unparsed->vx_flags & VXLAN_HF_GBP)) goto out; md->gbp = ntohs(gbp->policy_id); tun_dst = (struct metadata_dst *)skb_dst(skb); if (tun_dst) { tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; tun_dst->u.tun_info.options_len = sizeof(*md); } if (gbp->dont_learn) md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) md->gbp |= VXLAN_GBP_POLICY_APPLIED; /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; out: unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } static bool vxlan_set_mac(struct vxlan_dev *vxlan, struct vxlan_sock *vs, struct sk_buff *skb, __be32 vni) { union vxlan_addr saddr; u32 ifindex = skb->dev->ifindex; skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, vxlan->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) return false; /* Get address from the outer IP header */ if (vxlan_get_sk_family(vs) == AF_INET) { saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET6; #endif } if ((vxlan->cfg.flags & VXLAN_F_LEARN) && vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni)) return false; return true; } static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, struct sk_buff *skb) { int err = 0; if (vxlan_get_sk_family(vs) == AF_INET) err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err) && log_ecn_error) { if (vxlan_get_sk_family(vs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } return err <= 1; } /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { struct vxlan_vni_node *vninode = NULL; struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr unparsed; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); bool raw_proto = false; void *oiph; __be32 vni = 0; /* Need UDP and VXLAN header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) goto drop; unparsed = *vxlan_hdr(skb); /* VNI flag always required to be set */ if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", ntohl(vxlan_hdr(skb)->vx_flags), ntohl(vxlan_hdr(skb)->vx_vni)); /* Return non vxlan pkt */ goto drop; } unparsed.vx_flags &= ~VXLAN_HF_VNI; unparsed.vx_vni &= ~VXLAN_VNI_MASK; vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); if (!vxlan) goto drop; /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ if (vs->flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(&unparsed, &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; } if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, !net_eq(vxlan->net, dev_net(vxlan->dev)))) goto drop; if (vs->flags & VXLAN_F_REMCSUM_RX) if (unlikely(!vxlan_remcsum(&unparsed, skb, vs->flags))) goto drop; if (vxlan_collect_metadata(vs)) { struct metadata_dst *tun_dst; tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, key32_to_tunnel_id(vni), sizeof(*md)); if (!tun_dst) goto drop; md = ip_tunnel_info_opts(&tun_dst->u.tun_info); skb_dst_set(skb, (struct dst_entry *)tun_dst); } else { memset(md, 0, sizeof(*md)); } if (vs->flags & VXLAN_F_GBP) vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ if (unparsed.vx_flags || unparsed.vx_vni) { /* If there are any unprocessed flags remaining treat * this as a malformed packet. This behavior diverges from * VXLAN RFC (RFC7348) which stipulates that bits in reserved * in reserved fields are to be ignored. The approach here * maintains compatibility with previous stack code, and also * is more robust and provides a little more security in * adding extensions to VXLAN. */ goto drop; } if (!raw_proto) { if (!vxlan_set_mac(vxlan, vs, skb, vni)) goto drop; } else { skb_reset_mac_header(skb); skb->dev = vxlan->dev; skb->pkt_type = PACKET_HOST; } oiph = skb_network_header(skb); skb_reset_network_header(skb); if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { ++vxlan->dev->stats.rx_frame_errors; ++vxlan->dev->stats.rx_errors; vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } rcu_read_lock(); if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); dev_core_stats_rx_dropped_inc(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); goto drop; } dev_sw_netstats_rx_add(vxlan->dev, skb->len); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); rcu_read_unlock(); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } /* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */ static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) { struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr *hdr; __be32 vni; if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN)) return -EINVAL; hdr = vxlan_hdr(skb); if (!(hdr->vx_flags & VXLAN_HF_VNI)) return -EINVAL; vs = rcu_dereference_sk_user_data(sk); if (!vs) return -ENOENT; vni = vxlan_vni(hdr->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL); if (!vxlan) return -ENOENT; return 0; } static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct arphdr *parp; u8 *arpptr, *sha; __be32 sip, tip; struct neighbour *n; if (dev->flags & IFF_NOARP) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { dev->stats.tx_dropped++; goto out; } parp = arp_hdr(skb); if ((parp->ar_hrd != htons(ARPHRD_ETHER) && parp->ar_hrd != htons(ARPHRD_IEEE802)) || parp->ar_pro != htons(ETH_P_IP) || parp->ar_op != htons(ARPOP_REQUEST) || parp->ar_hln != dev->addr_len || parp->ar_pln != 4) goto out; arpptr = (u8 *)parp + sizeof(struct arphdr); sha = arpptr; arpptr += dev->addr_len; /* sha */ memcpy(&sip, arpptr, sizeof(sip)); arpptr += sizeof(sip); arpptr += dev->addr_len; /* tha */ memcpy(&tip, arpptr, sizeof(tip)); if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) goto out; n = neigh_lookup(&arp_tbl, &tip, dev); if (n) { struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; } reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); neigh_release(n); if (reply == NULL) goto out; skb_reset_mac_header(reply); __skb_pull(reply, skb_network_offset(reply)); reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { dev->stats.rx_dropped++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = tip, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); } out: consume_skb(skb); return NETDEV_TX_OK; } #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *vxlan_na_create(struct sk_buff *request, struct neighbour *n, bool isrouter) { struct net_device *dev = request->dev; struct sk_buff *reply; struct nd_msg *ns, *na; struct ipv6hdr *pip6; u8 *daddr; int na_olen = 8; /* opt hdr + ETH_ALEN for target */ int ns_olen; int i, len; if (dev == NULL || !pskb_may_pull(request, request->len)) return NULL; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + sizeof(*na) + na_olen + dev->needed_tailroom; reply = alloc_skb(len, GFP_ATOMIC); if (reply == NULL) return NULL; reply->protocol = htons(ETH_P_IPV6); reply->dev = dev; skb_reserve(reply, LL_RESERVED_SPACE(request->dev)); skb_push(reply, sizeof(struct ethhdr)); skb_reset_mac_header(reply); ns = (struct nd_msg *)(ipv6_hdr(request) + 1); daddr = eth_hdr(request)->h_source; ns_olen = request->len - skb_network_offset(request) - sizeof(struct ipv6hdr) - sizeof(*ns); for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { if (!ns->opt[i + 1]) { kfree_skb(reply); return NULL; } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; } } /* Ethernet header */ ether_addr_copy(eth_hdr(reply)->h_dest, daddr); ether_addr_copy(eth_hdr(reply)->h_source, n->ha); eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); reply->protocol = htons(ETH_P_IPV6); skb_pull(reply, sizeof(struct ethhdr)); skb_reset_network_header(reply); skb_put(reply, sizeof(struct ipv6hdr)); /* IPv6 header */ pip6 = ipv6_hdr(reply); memset(pip6, 0, sizeof(struct ipv6hdr)); pip6->version = 6; pip6->priority = ipv6_hdr(request)->priority; pip6->nexthdr = IPPROTO_ICMPV6; pip6->hop_limit = 255; pip6->daddr = ipv6_hdr(request)->saddr; pip6->saddr = *(struct in6_addr *)n->primary_key; skb_pull(reply, sizeof(struct ipv6hdr)); skb_reset_transport_header(reply); /* Neighbor Advertisement */ na = skb_put_zero(reply, sizeof(*na) + na_olen); na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; na->icmph.icmp6_router = isrouter; na->icmph.icmp6_override = 1; na->icmph.icmp6_solicited = 1; na->target = ns->target; ether_addr_copy(&na->opt[2], n->ha); na->opt[0] = ND_OPT_TARGET_LL_ADDR; na->opt[1] = na_olen >> 3; na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6, csum_partial(na, sizeof(*na)+na_olen, 0)); pip6->payload_len = htons(sizeof(*na)+na_olen); skb_push(reply, sizeof(struct ipv6hdr)); reply->ip_summed = CHECKSUM_UNNECESSARY; return reply; } static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); const struct in6_addr *daddr; const struct ipv6hdr *iphdr; struct inet6_dev *in6_dev; struct neighbour *n; struct nd_msg *msg; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) goto out; iphdr = ipv6_hdr(skb); daddr = &iphdr->daddr; msg = (struct nd_msg *)(iphdr + 1); if (ipv6_addr_loopback(daddr) || ipv6_addr_is_multicast(&msg->target)) goto out; n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); if (n) { struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; } reply = vxlan_na_create(skb, n, !!(f ? f->flags & NTF_ROUTER : 0)); neigh_release(n); if (reply == NULL) goto out; if (netif_rx(reply) == NET_RX_DROP) { dev->stats.rx_dropped++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin6.sin6_addr = msg->target, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); } out: rcu_read_unlock(); consume_skb(skb); return NETDEV_TX_OK; } #endif static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct neighbour *n; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) return false; n = NULL; switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: { struct iphdr *pip; if (!pskb_may_pull(skb, sizeof(struct iphdr))) return false; pip = ip_hdr(skb); n = neigh_lookup(&arp_tbl, &pip->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = pip->daddr, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: { struct ipv6hdr *pip6; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return false; pip6 = ipv6_hdr(skb); n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin6.sin6_addr = pip6->daddr, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #endif default: return false; } if (n) { bool diff; diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha); if (diff) { memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, dev->addr_len); memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); } neigh_release(n); return diff; } return false; } static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol) { struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; gpe->np_applied = 1; gpe->next_protocol = tun_p_from_eth_p(protocol); if (!gpe->next_protocol) return -EPFNOSUPPORT; return 0; } static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, int iphdr_len, __be32 vni, struct vxlan_metadata *md, u32 vxflags, bool udp_sum) { struct vxlanhdr *vxh; int min_headroom; int err; int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 inner_protocol = htons(ETH_P_TEB); if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { int csum_start = skb_checksum_start_offset(skb); if (csum_start <= VXLAN_MAX_REMCSUM_START && !(csum_start & VXLAN_RCO_SHIFT_MASK) && (skb->csum_offset == offsetof(struct udphdr, check) || skb->csum_offset == offsetof(struct tcphdr, check))) type |= SKB_GSO_TUNNEL_REMCSUM; } min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + iphdr_len; /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); if (unlikely(err)) return err; err = iptunnel_handle_offloads(skb, type); if (err) return err; vxh = __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = VXLAN_HF_VNI; vxh->vx_vni = vxlan_vni_field(vni); if (type & SKB_GSO_TUNNEL_REMCSUM) { unsigned int start; start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr); vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset); vxh->vx_flags |= VXLAN_HF_RCO; if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; skb->encapsulation = 0; } } if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, md); if (vxflags & VXLAN_F_GPE) { err = vxlan_build_gpe_hdr(vxh, skb->protocol); if (err < 0) return err; inner_protocol = skb->protocol; } skb_set_inner_protocol(skb, inner_protocol); return 0; } /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan, __be32 vni, bool snoop) { union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; struct net_device *dev; int len = skb->len; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; __skb_pull(skb, skb_network_offset(skb)); if (remote_ip->sa.sa_family == AF_INET) { loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); loopback.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { loopback.sin6.sin6_addr = in6addr_loopback; loopback.sa.sa_family = AF_INET6; #endif } rcu_read_lock(); dev = skb->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto drop; } if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop) vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); dev_sw_netstats_tx_add(src_vxlan->dev, 1, len); vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { dev_sw_netstats_rx_add(dst_vxlan->dev, len); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, len); } else { drop: dev->stats.rx_dropped++; vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } rcu_read_unlock(); } static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan, int addr_family, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, u32 rt_flags) { #if IS_ENABLED(CONFIG_IPV6) /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry. */ BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL); #endif /* Bypass encapsulation if the destination is local */ if (rt_flags & RTCF_LOCAL && !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) { struct vxlan_dev *dst_vxlan; dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, addr_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); return -ENOENT; } vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; } return 0; } void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc) { struct dst_cache *dst_cache; struct ip_tunnel_info *info; struct ip_tunnel_key *pkey; struct ip_tunnel_key key; struct vxlan_dev *vxlan = netdev_priv(dev); const struct iphdr *old_iph = ip_hdr(skb); struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; unsigned int pkt_len = skb->len; __be16 src_port = 0, dst_port; struct dst_entry *ndst = NULL; int addr_family; __u8 tos, ttl; int ifindex; int err; u32 flags = vxlan->cfg.flags; bool use_cache; bool udp_sum = false; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); __be32 vni = 0; info = skb_tunnel_info(skb); use_cache = ip_tunnel_dst_cache_usable(skb, info); if (rdst) { memset(&key, 0, sizeof(key)); pkey = &key; if (vxlan_addr_any(&rdst->remote_ip)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan, default_vni, true); return; } goto drop; } addr_family = vxlan->cfg.saddr.sa.sa_family; dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = (rdst->remote_vni) ? : default_vni; ifindex = rdst->remote_ifindex; if (addr_family == AF_INET) { key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr; key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr; } else { key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr; key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr; } dst_cache = &rdst->dst_cache; md->gbp = skb->mark; if (flags & VXLAN_F_TTL_INHERIT) { ttl = ip_tunnel_get_ttl(old_iph, skb); } else { ttl = vxlan->cfg.ttl; if (!ttl && vxlan_addr_multicast(&rdst->remote_ip)) ttl = 1; } tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); if (tos && !info) use_cache = false; if (addr_family == AF_INET) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); else udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); #if IS_ENABLED(CONFIG_IPV6) switch (vxlan->cfg.label_policy) { case VXLAN_LABEL_FIXED: key.label = vxlan->cfg.label; break; case VXLAN_LABEL_INHERIT: key.label = ip_tunnel_get_flowlabel(old_iph, skb); break; default: DEBUG_NET_WARN_ON_ONCE(1); goto drop; } #endif } else { if (!info) { WARN_ONCE(1, "%s: Missing encapsulation instructions\n", dev->name); goto drop; } pkey = &info->key; addr_family = ip_tunnel_info_af(info); dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { if (info->options_len < sizeof(*md)) goto drop; md = ip_tunnel_info_opts(info); } ttl = info->key.ttl; tos = info->key.tos; udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); } src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); rcu_read_lock(); if (addr_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; __be16 df = 0; __be32 saddr; if (!ifindex) ifindex = sock4->sock->sk->sk_bound_dev_if; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto tx_error; } if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, dst_port, ifindex, vni, &rt->dst, rt->rt_flags); if (err) goto out_unlock; if (vxlan->cfg.df == VXLAN_DF_SET) { df = htons(IP_DF); } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) { struct ethhdr *eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_IPV6 || (ntohs(eth->h_proto) == ETH_P_IP && old_iph->frag_off & htons(IP_DF))) df = htons(IP_DF); } } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) { df = htons(IP_DF); } ndst = &rt->dst; err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv4.src = pkey->u.ipv4.dst; unclone->key.u.ipv4.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), vni, md, flags, udp_sum); if (err < 0) goto tx_error; udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, src_port, dst_port, xnet, !udp_sum); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct in6_addr saddr; if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); ndst = NULL; goto tx_error; } if (!info) { u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6, dst_port, ifindex, vni, ndst, rt6i_flags); if (err) goto out_unlock; } err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv6.src = pkey->u.ipv6.dst; unclone->key.u.ipv6.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), vni, md, flags, udp_sum); if (err < 0) goto tx_error; udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); out_unlock: rcu_read_unlock(); return; drop: dev->stats.tx_dropped++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return; tx_error: rcu_read_unlock(); if (err == -ELOOP) dev->stats.collisions++; else if (err == -ENETUNREACH) dev->stats.tx_carrier_errors++; dst_release(ndst); dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); } static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, struct vxlan_fdb *f, __be32 vni, bool did_rsc) { struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); rcu_read_lock(); nh = rcu_dereference(f->nh); if (!nh) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); else goto drop; return; drop: dev->stats.tx_dropped++; vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); } static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, u32 nhid, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); rcu_read_lock(); nh = nexthop_find_by_id(dev_net(dev), nhid); if (unlikely(!nh || !nexthop_is_fdb(nh) || !nexthop_is_multipath(nh))) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); if (vxlan->cfg.saddr.sa.sa_family != nh_rdst.remote_ip.sa.sa_family) goto drop; if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, false); else goto drop; return NETDEV_TX_OK; drop: dev->stats.tx_dropped++; vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return NETDEV_TX_OK; } /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. * Outer UDP destination is the VXLAN assigned port. * source port is based on hash of flow */ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst, *fdst = NULL; const struct ip_tunnel_info *info; bool did_rsc = false; struct vxlan_fdb *f; struct ethhdr *eth; __be32 vni = 0; u32 nhid = 0; info = skb_tunnel_info(skb); skb_reset_mac_header(skb); if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && info->mode & IP_TUNNEL_INFO_TX) { vni = tunnel_id_to_key32(info->key.tun_id); nhid = info->key.nhid; } else { if (info && info->mode & IP_TUNNEL_INFO_TX) vxlan_xmit_one(skb, dev, vni, NULL, false); else kfree_skb(skb); return NETDEV_TX_OK; } } if (vxlan->cfg.flags & VXLAN_F_PROXY) { eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_ARP) return arp_reduce(dev, skb, vni); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(eth->h_proto) == ETH_P_IPV6 && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1); if (m->icmph.icmp6_code == 0 && m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) return neigh_reduce(dev, skb, vni); } #endif } if (nhid) return vxlan_xmit_nhid(skb, dev, nhid, vni); if (vxlan->cfg.flags & VXLAN_F_MDB) { struct vxlan_mdb_entry *mdb_entry; rcu_read_lock(); mdb_entry = vxlan_mdb_entry_skb_get(vxlan, skb, vni); if (mdb_entry) { netdev_tx_t ret; ret = vxlan_mdb_xmit(vxlan, mdb_entry, skb); rcu_read_unlock(); return ret; } rcu_read_unlock(); } eth = eth_hdr(skb); f = vxlan_find_mac(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && (ntohs(eth->h_proto) == ETH_P_IP || ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) f = vxlan_find_mac(vxlan, eth->h_dest, vni); } if (f == NULL) { f = vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); dev->stats.tx_dropped++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb(skb); return NETDEV_TX_OK; } } if (rcu_access_pointer(f->nh)) { vxlan_xmit_nh(skb, dev, f, (vni ? : vxlan->default_dst.remote_vni), did_rsc); } else { list_for_each_entry_rcu(rdst, &f->remotes, list) { struct sk_buff *skb1; if (!fdst) { fdst = rdst; continue; } skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } if (fdst) vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else kfree_skb(skb); } return NETDEV_TX_OK; } /* Walk the forwarding table and purge stale entries */ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; unsigned int h; if (!netif_running(vxlan->dev)) return; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; spin_lock(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); unsigned long timeout; if (f->state & (NUD_PERMANENT | NUD_NOARP)) continue; if (f->flags & NTF_EXT_LEARNED) continue; timeout = f->used + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", f->eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); } else if (time_before(timeout, next_timer)) next_timer = timeout; } spin_unlock(&vxlan->hash_lock[h]); } mod_timer(&vxlan->age_timer, next_timer); } static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_init_rcu(&vxlan->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&vxlan->hlist6.hlist); #endif spin_unlock(&vn->sock_lock); } static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan, struct vxlan_dev_node *node) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); __be32 vni = vxlan->default_dst.remote_vni; node->vxlan = vxlan; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&node->hlist, vni_head(vs, vni)); spin_unlock(&vn->sock_lock); } /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int err; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_init(vxlan); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) { err = -ENOMEM; goto err_vnigroup_uninit; } err = gro_cells_init(&vxlan->gro_cells, dev); if (err) goto err_free_percpu; err = vxlan_mdb_init(vxlan); if (err) goto err_gro_cells_destroy; return 0; err_gro_cells_destroy: gro_cells_destroy(&vxlan->gro_cells); err_free_percpu: free_percpu(dev->tstats); err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); return err; } static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_fdb *f; u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f) vxlan_fdb_destroy(vxlan, f, true, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); } static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); vxlan_mdb_fini(vxlan); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); free_percpu(dev->tstats); } /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int ret; ret = vxlan_sock_add(vxlan); if (ret < 0) return ret; ret = vxlan_multicast_join(vxlan); if (ret) { vxlan_sock_release(vxlan); return ret; } if (vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); return ret; } struct vxlan_fdb_flush_desc { bool ignore_default_entry; unsigned long state; unsigned long state_mask; unsigned long flags; unsigned long flags_mask; __be32 src_vni; u32 nhid; __be32 vni; __be16 port; union vxlan_addr dst_ip; }; static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan) { return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni; } static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) { struct nexthop *nh = rtnl_dereference(f->nh); return nh && nh->id == nhid; } static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { if (desc->state_mask && (f->state & desc->state_mask) != desc->state) return false; if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) return false; if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) return false; if (desc->src_vni && f->vni != desc->src_vni) return false; if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) return false; return true; } static bool vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc) { return desc->vni || desc->port || desc->dst_ip.sa.sa_family; } static bool vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc, const struct vxlan_rdst *rd) { if (desc->vni && rd->remote_vni != desc->vni) return false; if (desc->port && rd->remote_port != desc->port) return false; if (desc->dst_ip.sa.sa_family && !vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip)) return false; return true; } static void vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc, bool *p_destroy_fdb) { bool remotes_flushed = false; struct vxlan_rdst *rd, *tmp; list_for_each_entry_safe(rd, tmp, &f->remotes, list) { if (!vxlan_fdb_flush_remote_matches(desc, rd)) continue; vxlan_fdb_dst_destroy(vxlan, f, rd, true); remotes_flushed = true; } *p_destroy_fdb = remotes_flushed && list_empty(&f->remotes); } /* Purge the forwarding table */ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); unsigned int h; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); if (!vxlan_fdb_flush_matches(f, vxlan, desc)) continue; if (match_remotes) { bool destroy_fdb = false; vxlan_fdb_flush_match_remotes(f, vxlan, desc, &destroy_fdb); if (!destroy_fdb) continue; } vxlan_fdb_destroy(vxlan, f, true, true); } spin_unlock_bh(&vxlan->hash_lock[h]); } } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { [NDA_SRC_VNI] = { .type = NLA_U32 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_DST] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, }; #define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \ NTF_ROUTER) static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = {}; struct ndmsg *ndm = nlmsg_data(nlh); struct nlattr *tb[NDA_MAX + 1]; u8 ndm_flags; int err; ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy, extack); if (err) return err; if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set"); return -EINVAL; } if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set"); return -EINVAL; } desc.state = ndm->ndm_state; desc.flags = ndm_flags; if (tb[NDA_NDM_STATE_MASK]) desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]); if (tb[NDA_NDM_FLAGS_MASK]) desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]); if (tb[NDA_SRC_VNI]) desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); if (tb[NDA_NH_ID]) desc.nhid = nla_get_u32(tb[NDA_NH_ID]); if (tb[NDA_VNI]) desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); if (tb[NDA_PORT]) desc.port = nla_get_be16(tb[NDA_PORT]); if (tb[NDA_DST]) { union vxlan_addr ip; err = vxlan_nla_get_addr(&ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST], "Unsupported address family"); return err; } desc.dst_ip = ip; } vxlan_flush(vxlan, &desc); return 0; } /* Cleanup timer and forwarding table on shutdown */ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { /* Default entry is deleted at vxlan_uninit. */ .ignore_default_entry = true, .state = 0, .state_mask = NUD_PERMANENT | NUD_NOARP, }; vxlan_multicast_leave(vxlan); del_timer_sync(&vxlan->age_timer); vxlan_flush(vxlan, &desc); vxlan_sock_release(vxlan); return 0; } /* Stub, nothing needs to be done. */ static void vxlan_set_multicast_list(struct net_device *dev) { } static int vxlan_change_mtu(struct net_device *dev, int new_mtu) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); /* This check is different than dev->max_mtu, because it looks at * the lowerdev->mtu, rather than the static dev->max_mtu */ if (lowerdev) { int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags); if (new_mtu > max_mtu) return -EINVAL; } dev->mtu = new_mtu; return 0; } static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct ip_tunnel_info *info = skb_tunnel_info(skb); __be16 sport, dport; sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); dport = info->key.tp_dst ? : vxlan->cfg.dst_port; if (ip_tunnel_info_af(info) == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; if (!sock4) return -EIO; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0, &info->key.u.ipv4.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); } else { #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct dst_entry *ndst; if (!sock6) return -EIO; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, 0, &info->key.u.ipv6.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(ndst)) return PTR_ERR(ndst); dst_release(ndst); #else /* !CONFIG_IPV6 */ return -EPFNOSUPPORT; #endif } info->key.tp_src = sport; info->key.tp_dst = dport; return 0; } static const struct net_device_ops vxlan_netdev_ether_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_rx_mode = vxlan_set_multicast_list, .ndo_change_mtu = vxlan_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_fdb_add = vxlan_fdb_add, .ndo_fdb_del = vxlan_fdb_delete, .ndo_fdb_del_bulk = vxlan_fdb_delete_bulk, .ndo_fdb_dump = vxlan_fdb_dump, .ndo_fdb_get = vxlan_fdb_get, .ndo_mdb_add = vxlan_mdb_add, .ndo_mdb_del = vxlan_mdb_del, .ndo_mdb_del_bulk = vxlan_mdb_del_bulk, .ndo_mdb_dump = vxlan_mdb_dump, .ndo_mdb_get = vxlan_mdb_get, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; static const struct net_device_ops vxlan_netdev_raw_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = vxlan_change_mtu, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; /* Info for udev, that this is a virtual tunnel endpoint */ static struct device_type vxlan_type = { .name = "vxlan", }; /* Calls the ndo_udp_tunnel_add of the caller in order to * supply the listening VXLAN udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) { struct vxlan_sock *vs; struct net *net = dev_net(dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int i; spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { unsigned short type; if (vs->flags & VXLAN_F_GPE) type = UDP_TUNNEL_TYPE_VXLAN_GPE; else type = UDP_TUNNEL_TYPE_VXLAN; if (push) udp_tunnel_push_rx_port(dev, vs->sock, type); else udp_tunnel_drop_rx_port(dev, vs->sock, type); } } spin_unlock(&vn->sock_lock); } /* Initialize the device structure. */ static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; eth_hw_addr_random(dev); ether_setup(dev); dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &vxlan_type); dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->vlan_features = dev->features; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE | IFF_CHANGE_PROTO_DOWN; /* MTU range: 68 - 65535 */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_MAX_MTU; INIT_LIST_HEAD(&vxlan->next); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_init(&vxlan->hash_lock[h]); INIT_HLIST_HEAD(&vxlan->fdb_head[h]); } } static void vxlan_ether_setup(struct net_device *dev) { dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->netdev_ops = &vxlan_netdev_ether_ops; } static void vxlan_raw_setup(struct net_device *dev) { dev->header_ops = NULL; dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->netdev_ops = &vxlan_netdev_raw_ops; } static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, [IFLA_VXLAN_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "MTU must be between 68 and 65535"); return -EINVAL; } } if (!data) { NL_SET_ERR_MSG(extack, "Required attributes not provided to perform the operation"); return -EINVAL; } if (data[IFLA_VXLAN_ID]) { u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); if (id >= VXLAN_N_VID) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID], "VXLAN ID must be lower than 16777216"); return -ERANGE; } } if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE], "Invalid source port range"); return -EINVAL; } } if (data[IFLA_VXLAN_DF]) { enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]); if (df < 0 || df > VXLAN_DF_MAX) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF], "Invalid DF attribute"); return -EINVAL; } } return 0; } static void vxlan_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); } static int vxlan_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); if (!lowerdev) { cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; cmd->base.speed = SPEED_UNKNOWN; return 0; } return __ethtool_get_link_ksettings(lowerdev, cmd); } static const struct ethtool_ops vxlan_ethtool_ops = { .get_drvinfo = vxlan_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = vxlan_get_link_ksettings, }; static struct socket *vxlan_create_sock(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct socket *sock; struct udp_port_cfg udp_conf; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6) { udp_conf.family = AF_INET6; udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); udp_conf.ipv6_v6only = 1; } else { udp_conf.family = AF_INET; } udp_conf.local_udp_port = port; udp_conf.bind_ifindex = ifindex; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct socket *sock; unsigned int h; struct udp_tunnel_sock_cfg tunnel_cfg; vs = kzalloc(sizeof(*vs), GFP_KERNEL); if (!vs) return ERR_PTR(-ENOMEM); for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vs->vni_list[h]); sock = vxlan_create_sock(net, ipv6, port, flags, ifindex); if (IS_ERR(sock)) { kfree(vs); return ERR_CAST(sock); } vs->sock = sock; refcount_set(&vs->refcnt, 1); vs->flags = (flags & VXLAN_F_RCV_FLAGS); spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); udp_tunnel_notify_add_rx_port(sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); /* Mark socket as an encapsulation socket. */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = vs; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = vxlan_rcv; tunnel_cfg.encap_err_lookup = vxlan_err_lookup; tunnel_cfg.encap_destroy = NULL; if (vs->flags & VXLAN_F_GPE) { tunnel_cfg.gro_receive = vxlan_gpe_gro_receive; tunnel_cfg.gro_complete = vxlan_gpe_gro_complete; } else { tunnel_cfg.gro_receive = vxlan_gro_receive; tunnel_cfg.gro_complete = vxlan_gro_complete; } setup_udp_tunnel_sock(net, sock, &tunnel_cfg); return vs; } static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; struct vxlan_sock *vs = NULL; struct vxlan_dev_node *node; int l3mdev_index = 0; if (vxlan->cfg.remote_ifindex) l3mdev_index = l3mdev_master_upper_ifindex_by_index( vxlan->net, vxlan->cfg.remote_ifindex); if (!vxlan->cfg.no_share) { spin_lock(&vn->sock_lock); vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (vs && !refcount_inc_not_zero(&vs->refcnt)) { spin_unlock(&vn->sock_lock); return -EBUSY; } spin_unlock(&vn->sock_lock); } if (!vs) vs = vxlan_socket_create(vxlan->net, ipv6, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (IS_ERR(vs)) return PTR_ERR(vs); #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(vxlan->vn6_sock, vs); node = &vxlan->hlist6; } else #endif { rcu_assign_pointer(vxlan->vn4_sock, vs); node = &vxlan->hlist4; } if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER)) vxlan_vs_add_vnigrp(vxlan, vs, ipv6); else vxlan_vs_add_dev(vs, vxlan, node); return 0; } static int vxlan_sock_add(struct vxlan_dev *vxlan) { bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata; bool ipv4 = !ipv6 || metadata; int ret = 0; RCU_INIT_POINTER(vxlan->vn4_sock, NULL); #if IS_ENABLED(CONFIG_IPV6) RCU_INIT_POINTER(vxlan->vn6_sock, NULL); if (ipv6) { ret = __vxlan_sock_add(vxlan, true); if (ret < 0 && ret != -EAFNOSUPPORT) ipv4 = false; } #endif if (ipv4) ret = __vxlan_sock_add(vxlan, false); if (ret < 0) vxlan_sock_release(vxlan); return ret; } int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, struct vxlan_config *conf, __be32 vni) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *tmp; list_for_each_entry(tmp, &vn->vxlan_list, next) { if (tmp == vxlan) continue; if (tmp->cfg.flags & VXLAN_F_VNIFILTER) { if (!vxlan_vnifilter_lookup(tmp, vni)) continue; } else if (tmp->cfg.vni != vni) { continue; } if (tmp->cfg.dst_port != conf->dst_port) continue; if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) continue; if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && tmp->cfg.remote_ifindex != conf->remote_ifindex) continue; return -EEXIST; } return 0; } static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, struct net_device **lower, struct vxlan_dev *old, struct netlink_ext_ack *extack) { bool use_ipv6 = false; if (conf->flags & VXLAN_F_GPE) { /* For now, allow GPE only together with * COLLECT_METADATA. This can be relaxed later; in such * case, the other side of the PtP link will have to be * provided. */ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG(extack, "VXLAN GPE does not support this combination of attributes"); return -EINVAL; } } if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) { /* Unless IPv6 is explicitly requested, assume IPv4 */ conf->remote_ip.sa.sa_family = AF_INET; conf->saddr.sa.sa_family = AF_INET; } else if (!conf->remote_ip.sa.sa_family) { conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family; } else if (!conf->saddr.sa.sa_family) { conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; } if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) { NL_SET_ERR_MSG(extack, "Local and remote address must be from the same family"); return -EINVAL; } if (vxlan_addr_multicast(&conf->saddr)) { NL_SET_ERR_MSG(extack, "Local address cannot be multicast"); return -EINVAL; } if (conf->saddr.sa.sa_family == AF_INET6) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } use_ipv6 = true; conf->flags |= VXLAN_F_IPV6; if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) { int local_type = ipv6_addr_type(&conf->saddr.sin6.sin6_addr); int remote_type = ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr); if (local_type & IPV6_ADDR_LINKLOCAL) { if (!(remote_type & IPV6_ADDR_LINKLOCAL) && (remote_type != IPV6_ADDR_ANY)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags |= VXLAN_F_IPV6_LINKLOCAL; } else { if (remote_type == (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL; } } } if (conf->label && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label attribute only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->label_policy && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label policy only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->remote_ifindex) { struct net_device *lowerdev; lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); if (!lowerdev) { NL_SET_ERR_MSG(extack, "Invalid local interface, device not found"); return -ENODEV; } #if IS_ENABLED(CONFIG_IPV6) if (use_ipv6) { struct inet6_dev *idev = __in6_dev_get(lowerdev); if (idev && idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 support disabled by administrator"); return -EPERM; } } #endif *lower = lowerdev; } else { if (vxlan_addr_multicast(&conf->remote_ip)) { NL_SET_ERR_MSG(extack, "Local interface required for multicast remote destination"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) { NL_SET_ERR_MSG(extack, "Local interface required for link-local local/remote addresses"); return -EINVAL; } #endif *lower = NULL; } if (!conf->dst_port) { if (conf->flags & VXLAN_F_GPE) conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT); else conf->dst_port = htons(vxlan_port); } if (!conf->age_interval) conf->age_interval = FDB_AGE_DEFAULT; if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) { NL_SET_ERR_MSG(extack, "A VXLAN device with the specified VNI already exists"); return -EEXIST; } return 0; } static void vxlan_config_apply(struct net_device *dev, struct vxlan_config *conf, struct net_device *lowerdev, struct net *src_net, bool changelink) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; unsigned short needed_headroom = ETH_HLEN; int max_mtu = ETH_MAX_MTU; u32 flags = conf->flags; if (!changelink) { if (flags & VXLAN_F_GPE) vxlan_raw_setup(dev); else vxlan_ether_setup(dev); if (conf->mtu) dev->mtu = conf->mtu; vxlan->net = src_net; } dst->remote_vni = conf->vni; memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); if (lowerdev) { dst->remote_ifindex = conf->remote_ifindex; netif_inherit_tso_max(dev, lowerdev); needed_headroom = lowerdev->hard_header_len; needed_headroom += lowerdev->needed_headroom; dev->needed_tailroom = lowerdev->needed_tailroom; max_mtu = lowerdev->mtu - vxlan_headroom(flags); if (max_mtu < ETH_MIN_MTU) max_mtu = ETH_MIN_MTU; if (!changelink && !conf->mtu) dev->mtu = max_mtu; } if (dev->mtu > max_mtu) dev->mtu = max_mtu; if (flags & VXLAN_F_COLLECT_METADATA) flags |= VXLAN_F_IPV6; needed_headroom += vxlan_headroom(flags); dev->needed_headroom = needed_headroom; memcpy(&vxlan->cfg, conf, sizeof(*conf)); } static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *lowerdev; int ret; ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack); if (ret) return ret; vxlan_config_apply(dev, conf, lowerdev, src_net, changelink); return 0; } static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_config *conf, struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; struct vxlan_fdb *f = NULL; bool unregister = false; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_dev_configure(net, dev, conf, false, extack); if (err) return err; dev->ethtool_ops = &vxlan_ethtool_ops; /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { err = vxlan_fdb_create(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, NTF_SELF, 0, &f, extack); if (err) return err; } err = register_netdevice(dev); if (err) goto errout; unregister = true; if (dst->remote_ifindex) { remote_dev = __dev_get_by_index(net, dst->remote_ifindex); if (!remote_dev) { err = -ENODEV; goto errout; } err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) goto errout; } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; if (f) { vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); /* notify default fdb entry */ err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, true, extack); if (err) { vxlan_fdb_destroy(vxlan, f, false, false); if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); goto unregister; } } list_add(&vxlan->next, &vn->vxlan_list); if (remote_dev) dst->remote_dev = remote_dev; return 0; unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); errout: /* unregister_netdevice() destroys the default FDB entry with deletion * notification. But the addition notification was not sent yet, so * destroy the entry by hand here. */ if (f) __vxlan_fdb_free(f); unregister: if (unregister) unregister_netdevice(dev); return err; } /* Set/clear flags based on attribute */ static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[], int attrtype, unsigned long mask, bool changelink, bool changelink_supported, struct netlink_ext_ack *extack) { unsigned long flags; if (!tb[attrtype]) return 0; if (changelink && !changelink_supported) { vxlan_flag_attr_error(attrtype, extack); return -EOPNOTSUPP; } if (vxlan_policy[attrtype].type == NLA_FLAG) flags = conf->flags | mask; else if (nla_get_u8(tb[attrtype])) flags = conf->flags | mask; else flags = conf->flags & ~mask; conf->flags = flags; return 0; } static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); int err = 0; memset(conf, 0, sizeof(*conf)); /* if changelink operation, start with old existing cfg */ if (changelink) memcpy(conf, &vxlan->cfg, sizeof(*conf)); if (data[IFLA_VXLAN_ID]) { __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); if (changelink && (vni != conf->vni)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI"); return -EOPNOTSUPP; } conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); } if (data[IFLA_VXLAN_GROUP]) { if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); conf->remote_ip.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_GROUP6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); conf->remote_ip.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LOCAL]) { if (changelink && (conf->saddr.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old"); return -EOPNOTSUPP; } conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); conf->saddr.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_LOCAL6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old"); return -EOPNOTSUPP; } /* TODO: respect scope id */ conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); conf->saddr.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LINK]) conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); if (data[IFLA_VXLAN_TOS]) conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); if (data[IFLA_VXLAN_TTL]) conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); if (data[IFLA_VXLAN_TTL_INHERIT]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT, VXLAN_F_TTL_INHERIT, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LABEL]) conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) & IPV6_FLOWLABEL_MASK; if (data[IFLA_VXLAN_LABEL_POLICY]) conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]); if (data[IFLA_VXLAN_LEARNING]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING, VXLAN_F_LEARN, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to learn on a new device */ conf->flags |= VXLAN_F_LEARN; } if (data[IFLA_VXLAN_AGEING]) conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); if (data[IFLA_VXLAN_PROXY]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY, VXLAN_F_PROXY, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_RSC]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC, VXLAN_F_RSC, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L2MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS, VXLAN_F_L2MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L3MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS, VXLAN_F_L3MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LIMIT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT], "Cannot change limit"); return -EOPNOTSUPP; } conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); } if (data[IFLA_VXLAN_COLLECT_METADATA]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA, VXLAN_F_COLLECT_METADATA, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_PORT_RANGE]) { if (!changelink) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); conf->port_min = ntohs(p->low); conf->port_max = ntohs(p->high); } else { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], "Cannot change port range"); return -EOPNOTSUPP; } } if (data[IFLA_VXLAN_PORT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT], "Cannot change port"); return -EOPNOTSUPP; } conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); } if (data[IFLA_VXLAN_UDP_CSUM]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM], "Cannot change UDP_CSUM flag"); return -EOPNOTSUPP; } if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; } if (data[IFLA_VXLAN_LOCALBYPASS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, VXLAN_F_LOCALBYPASS, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to local bypass on a new device */ conf->flags |= VXLAN_F_LOCALBYPASS; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, VXLAN_F_UDP_ZERO_CSUM6_RX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX, VXLAN_F_REMCSUM_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX, VXLAN_F_REMCSUM_RX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_GBP]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP, VXLAN_F_GBP, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_GPE]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE, VXLAN_F_GPE, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, false, extack); if (err) return err; } if (tb[IFLA_MTU]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "Cannot change mtu"); return -EOPNOTSUPP; } conf->mtu = nla_get_u32(tb[IFLA_MTU]); } if (data[IFLA_VXLAN_DF]) conf->df = nla_get_u8(data[IFLA_VXLAN_DF]); if (data[IFLA_VXLAN_VNIFILTER]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER, VXLAN_F_VNIFILTER, changelink, false, extack); if (err) return err; if ((conf->flags & VXLAN_F_VNIFILTER) && !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER], "vxlan vnifilter only valid in collect metadata mode"); return -EINVAL; } } return 0; } static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxlan_config conf; int err; err = vxlan_nl2conf(tb, data, dev, &conf, false, extack); if (err) return err; return __vxlan_dev_create(src_net, dev, &conf, extack); } static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *lowerdev; struct vxlan_config conf; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_nl2conf(tb, data, dev, &conf, true, extack); if (err) return err; err = vxlan_config_validate(vxlan->net, &conf, &lowerdev, vxlan, extack); if (err) return err; if (dst->remote_dev == lowerdev) lowerdev = NULL; err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev, extack); if (err) return err; /* handle default dst entry */ if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) { u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, NUD_REACHABLE | NUD_PERMANENT, NLM_F_APPEND | NLM_F_CREATE, vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } if (!vxlan_addr_any(&dst->remote_ip)) __vxlan_fdb_delete(vxlan, all_zeros_mac, dst->remote_ip, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip */ if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip, &conf.remote_ip, extack); if (err) { netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } } if (conf.age_interval != vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies); netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev); if (lowerdev && lowerdev != dst->remote_dev) dst->remote_dev = lowerdev; vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); return 0; } static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { /* Default entry is deleted at vxlan_uninit. */ .ignore_default_entry = true, }; vxlan_flush(vxlan, &desc); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); if (vxlan->default_dst.remote_dev) netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev); } static size_t vxlan_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LABEL_POLICY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ /* IFLA_VXLAN_PORT_RANGE */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + nla_total_size(0) + /* IFLA_VXLAN_GBP */ nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ 0; } static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { const struct vxlan_dev *vxlan = netdev_priv(dev); const struct vxlan_rdst *dst = &vxlan->default_dst; struct ifla_vxlan_port_range ports = { .low = htons(vxlan->cfg.port_min), .high = htons(vxlan->cfg.port_max), }; if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) goto nla_put_failure; if (!vxlan_addr_any(&dst->remote_ip)) { if (dst->remote_ip.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, dst->remote_ip.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, &dst->remote_ip.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; if (!vxlan_addr_any(&vxlan->cfg.saddr)) { if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, vxlan->cfg.saddr.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, &vxlan->cfg.saddr.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) || nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, !!(vxlan->cfg.flags & VXLAN_F_LEARN)) || nla_put_u8(skb, IFLA_VXLAN_PROXY, !!(vxlan->cfg.flags & VXLAN_F_PROXY)) || nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->cfg.flags & VXLAN_F_RSC)) || nla_put_u8(skb, IFLA_VXLAN_L2MISS, !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) || nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GBP && nla_put_flag(skb, IFLA_VXLAN_GBP)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GPE && nla_put_flag(skb, IFLA_VXLAN_GPE)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL && nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER && nla_put_u8(skb, IFLA_VXLAN_VNIFILTER, !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vxlan_get_link_net(const struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); return vxlan->net; } static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .kind = "vxlan", .maxtype = IFLA_VXLAN_MAX, .policy = vxlan_policy, .priv_size = sizeof(struct vxlan_dev), .setup = vxlan_setup, .validate = vxlan_validate, .newlink = vxlan_newlink, .changelink = vxlan_changelink, .dellink = vxlan_dellink, .get_size = vxlan_get_size, .fill_info = vxlan_fill_info, .get_link_net = vxlan_get_link_net, }; struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; int err; memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &vxlan_link_ops, tb, NULL); if (IS_ERR(dev)) return dev; err = __vxlan_dev_create(net, dev, conf, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) { LIST_HEAD(list_kill); vxlan_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } return dev; } EXPORT_SYMBOL_GPL(vxlan_dev_create); static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, struct net_device *dev) { struct vxlan_dev *vxlan, *next; LIST_HEAD(list_kill); list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { struct vxlan_rdst *dst = &vxlan->default_dst; /* In case we created vxlan device with carrier * and we loose the carrier due to module unload * we also need to remove vxlan device. In other * cases, it's not necessary and remote_ifindex * is 0 here, so no matches. */ if (dst->remote_ifindex == dev->ifindex) vxlan_dellink(vxlan->dev, &list_kill); } unregister_netdevice_many(&list_kill); } static int vxlan_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); if (event == NETDEV_UNREGISTER) vxlan_handle_lowerdev_unregister(vn, dev); else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) vxlan_offload_rx_ports(dev, true); else if (event == NETDEV_UDP_TUNNEL_DROP_INFO) vxlan_offload_rx_ports(dev, false); return NOTIFY_DONE; } static struct notifier_block vxlan_notifier_block __read_mostly = { .notifier_call = vxlan_netdevice_event, }; static void vxlan_fdb_offloaded_set(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; u32 hash_index; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip, fdb_info->remote_port, fdb_info->remote_vni, fdb_info->remote_ifindex); if (!rdst) goto out; rdst->offloaded = fdb_info->offloaded; out: spin_unlock_bh(&vxlan->hash_lock[hash_index]); } static int vxlan_fdb_external_learn_add(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; u32 hash_index; int err; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); extack = switchdev_notifier_info_to_extack(&fdb_info->info); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } static int vxlan_fdb_external_learn_del(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 hash_index; int err = 0; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr, fdb_info->remote_ip, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, false); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } static int vxlan_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_vxlan_fdb_info *fdb_info; int err = 0; switch (event) { case SWITCHDEV_VXLAN_FDB_OFFLOADED: vxlan_fdb_offloaded_set(dev, ptr); break; case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_add(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = true; vxlan_fdb_offloaded_set(dev, fdb_info); break; case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_del(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = false; vxlan_fdb_offloaded_set(dev, fdb_info); break; } return err; } static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = { .notifier_call = vxlan_switchdev_event, }; static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; u32 hash_index; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); hash_index = fdb_head_index(vxlan, fdb->eth_addr, vxlan->default_dst.remote_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); if (!hlist_unhashed(&fdb->hlist)) vxlan_fdb_destroy(vxlan, fdb, false, false); spin_unlock_bh(&vxlan->hash_lock[hash_index]); } rcu_read_unlock(); } static int vxlan_nexthop_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct nh_notifier_info *info = ptr; struct nexthop *nh; if (event != NEXTHOP_EVENT_DEL) return NOTIFY_DONE; nh = nexthop_find_by_id(info->net, info->id); if (!nh) return NOTIFY_DONE; vxlan_fdb_nh_flush(nh); return NOTIFY_DONE; } static __net_init int vxlan_init_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int h; INIT_LIST_HEAD(&vn->vxlan_list); spin_lock_init(&vn->sock_lock); vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event; for (h = 0; h < PORT_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->sock_list[h]); return register_nexthop_notifier(net, &vn->nexthop_notifier_block, NULL); } static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan, *next; struct net_device *dev, *aux; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &vxlan_link_ops) unregister_netdevice_queue(dev, head); list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { /* If vxlan->dev is in the same netns, it has already been added * to the list by the previous loop. */ if (!net_eq(dev_net(vxlan->dev), net)) unregister_netdevice_queue(vxlan->dev, head); } } static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) { struct net *net; LIST_HEAD(list); unsigned int h; list_for_each_entry(net, net_list, exit_list) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); } rtnl_lock(); list_for_each_entry(net, net_list, exit_list) vxlan_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); for (h = 0; h < PORT_HASH_SIZE; ++h) WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); } } static struct pernet_operations vxlan_net_ops = { .init = vxlan_init_net, .exit_batch = vxlan_exit_batch_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), }; static int __init vxlan_init_module(void) { int rc; get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); rc = register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; rc = register_netdevice_notifier(&vxlan_notifier_block); if (rc) goto out2; rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block); if (rc) goto out3; rc = rtnl_link_register(&vxlan_link_ops); if (rc) goto out4; vxlan_vnifilter_init(); return 0; out4: unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); out3: unregister_netdevice_notifier(&vxlan_notifier_block); out2: unregister_pernet_subsys(&vxlan_net_ops); out1: return rc; } late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { vxlan_vnifilter_uninit(); rtnl_link_unregister(&vxlan_link_ops); unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); unregister_netdevice_notifier(&vxlan_notifier_block); unregister_pernet_subsys(&vxlan_net_ops); /* rcu_barrier() is called by netns */ } module_exit(vxlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(VXLAN_VERSION); MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("vxlan");
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge per vlan tunnel port dst_metadata netlink control interface * * Authors: * Roopa Prabhu <roopa@cumulusnetworks.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/etherdevice.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <uapi/linux/if_bridge.h> #include <net/dst_metadata.h> #include "br_private.h" #include "br_private_tunnel.h" static size_t __get_vlan_tinfo_size(void) { return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */ nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */ nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */ } bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *v_last) { __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id); __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id); return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1; } static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL; int num_tinfos = 0; /* Count number of vlan infos */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id) continue; if (!vtbegin) { goto initvars; } else if ((v->vid - vtend->vid) == 1 && vlan_tunid_inrange(v, vtend)) { vtend = v; continue; } else { if ((vtend->vid - vtbegin->vid) > 0) num_tinfos += 2; else num_tinfos += 1; } initvars: vtbegin = v; vtend = v; } if (vtbegin && vtend) { if ((vtend->vid - vtbegin->vid) > 0) num_tinfos += 2; else num_tinfos += 1; } return num_tinfos; } int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg) { int num_tinfos; if (!vg) return 0; rcu_read_lock(); num_tinfos = __get_num_vlan_tunnel_infos(vg); rcu_read_unlock(); return num_tinfos * __get_vlan_tinfo_size(); } static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid, __be64 tunnel_id, u16 flags) { __be32 tid = tunnel_id_to_key32(tunnel_id); struct nlattr *tmap; tmap = nla_nest_start_noflag(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO); if (!tmap) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID, be32_to_cpu(tid))) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID, vid)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS, flags)) goto nla_put_failure; nla_nest_end(skb, tmap); return 0; nla_put_failure: nla_nest_cancel(skb, tmap); return -EMSGSIZE; } static int br_fill_vlan_tinfo_range(struct sk_buff *skb, struct net_bridge_vlan *vtbegin, struct net_bridge_vlan *vtend) { int err; if (vtend && (vtend->vid - vtbegin->vid) > 0) { /* add range to skb */ err = br_fill_vlan_tinfo(skb, vtbegin->vid, vtbegin->tinfo.tunnel_id, BRIDGE_VLAN_INFO_RANGE_BEGIN); if (err) return err; err = br_fill_vlan_tinfo(skb, vtend->vid, vtend->tinfo.tunnel_id, BRIDGE_VLAN_INFO_RANGE_END); if (err) return err; } else { err = br_fill_vlan_tinfo(skb, vtbegin->vid, vtbegin->tinfo.tunnel_id, 0); if (err) return err; } return 0; } int br_fill_vlan_tunnel_info(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vtbegin = NULL; struct net_bridge_vlan *vtend = NULL; struct net_bridge_vlan *v; int err; /* Count number of vlan infos */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v)) continue; if (!v->tinfo.tunnel_dst) continue; if (!vtbegin) { goto initvars; } else if ((v->vid - vtend->vid) == 1 && vlan_tunid_inrange(v, vtend)) { vtend = v; continue; } else { err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend); if (err) return err; } initvars: vtbegin = v; vtend = v; } if (vtbegin) { err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend); if (err) return err; } return 0; } static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = { [IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC] = { .strict_start_type = IFLA_BRIDGE_VLAN_TUNNEL_FLAGS + 1 }, [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 }, [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 }, }; int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, u16 vid, u32 tun_id, bool *changed) { int err = 0; if (!p) return -EINVAL; switch (cmd) { case RTM_SETLINK: err = nbp_vlan_tunnel_info_add(p, vid, tun_id); if (!err) *changed = true; break; case RTM_DELLINK: if (!nbp_vlan_tunnel_info_delete(p, vid)) *changed = true; break; } return err; } int br_parse_vlan_tunnel_info(struct nlattr *attr, struct vtunnel_info *tinfo) { struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1]; u32 tun_id; u16 vid, flags = 0; int err; memset(tinfo, 0, sizeof(*tinfo)); err = nla_parse_nested_deprecated(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, attr, vlan_tunnel_policy, NULL); if (err < 0) return err; if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] || !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]) return -EINVAL; tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]); vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]); if (vid >= VLAN_VID_MASK) return -ERANGE; if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]) flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]); tinfo->tunid = tun_id; tinfo->vid = vid; tinfo->flags = flags; return 0; } /* send a notification if v_curr can't enter the range and start a new one */ static void __vlan_tunnel_handle_range(const struct net_bridge_port *p, struct net_bridge_vlan **v_start, struct net_bridge_vlan **v_end, int v_curr, bool curr_change) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; vg = nbp_vlan_group(p); if (!vg) return; v = br_vlan_find(vg, v_curr); if (!*v_start) goto out_init; if (v && curr_change && br_vlan_can_enter_range(v, *v_end)) { *v_end = v; return; } br_vlan_notify(p->br, p, (*v_start)->vid, (*v_end)->vid, RTM_NEWVLAN); out_init: /* we start a range only if there are any changes to notify about */ *v_start = curr_change ? v : NULL; *v_end = *v_start; } int br_process_vlan_tunnel_info(const struct net_bridge *br, const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, struct vtunnel_info *tinfo_last, bool *changed) { int err; if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) return -EINVAL; memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info)); } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) { struct net_bridge_vlan *v_start = NULL, *v_end = NULL; int t, v; if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)) return -EINVAL; if ((tinfo_curr->vid - tinfo_last->vid) != (tinfo_curr->tunid - tinfo_last->tunid)) return -EINVAL; t = tinfo_last->tunid; for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) { bool curr_change = false; err = br_vlan_tunnel_info(p, cmd, v, t, &curr_change); if (err) break; t++; if (curr_change) *changed = curr_change; __vlan_tunnel_handle_range(p, &v_start, &v_end, v, curr_change); } if (v_start && v_end) br_vlan_notify(br, p, v_start->vid, v_end->vid, RTM_NEWVLAN); if (err) return err; memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } else { if (tinfo_last->flags) return -EINVAL; err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid, tinfo_curr->tunid, changed); if (err) return err; br_vlan_notify(br, p, tinfo_curr->vid, 0, RTM_NEWVLAN); memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } return 0; }
79 65 77 69 66 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/sunrpc_syms.c * * Symbols exported by the sunrpc module. * * Copyright (C) 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/types.h> #include <linux/uio.h> #include <linux/unistd.h> #include <linux/init.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/auth.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/xprtsock.h> #include "sunrpc.h" #include "sysfs.h" #include "netns.h" unsigned int sunrpc_net_id; EXPORT_SYMBOL_GPL(sunrpc_net_id); static __net_init int sunrpc_init_net(struct net *net) { int err; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); err = rpc_proc_init(net); if (err) goto err_proc; err = ip_map_cache_create(net); if (err) goto err_ipmap; err = unix_gid_cache_create(net); if (err) goto err_unixgid; err = rpc_pipefs_init_net(net); if (err) goto err_pipefs; INIT_LIST_HEAD(&sn->all_clients); spin_lock_init(&sn->rpc_client_lock); spin_lock_init(&sn->rpcb_clnt_lock); return 0; err_pipefs: unix_gid_cache_destroy(net); err_unixgid: ip_map_cache_destroy(net); err_ipmap: rpc_proc_exit(net); err_proc: return err; } static __net_exit void sunrpc_exit_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); WARN_ON_ONCE(!list_empty(&sn->all_clients)); } static struct pernet_operations sunrpc_net_ops = { .init = sunrpc_init_net, .exit = sunrpc_exit_net, .id = &sunrpc_net_id, .size = sizeof(struct sunrpc_net), }; static int __init init_sunrpc(void) { int err = rpc_init_mempool(); if (err) goto out; err = rpcauth_init_module(); if (err) goto out2; cache_initialize(); err = register_pernet_subsys(&sunrpc_net_ops); if (err) goto out3; err = register_rpc_pipefs(); if (err) goto out4; err = rpc_sysfs_init(); if (err) goto out5; sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); #endif svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ return 0; out5: unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: rpcauth_remove_module(); out2: rpc_destroy_mempool(); out: return err; } static void __exit cleanup_sunrpc(void) { rpc_sysfs_exit(); rpc_cleanup_clids(); xprt_cleanup_ids(); xprt_multipath_cleanup_ids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); sunrpc_debugfs_exit(); unregister_rpc_pipefs(); rpc_destroy_mempool(); unregister_pernet_subsys(&sunrpc_net_ops); auth_domain_cleanup(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_unregister_sysctl(); #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_LICENSE("GPL"); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc);
648 105 105 494 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NAMEI_H #define _LINUX_NAMEI_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/path.h> #include <linux/fcntl.h> #include <linux/errno.h> enum { MAX_NESTED_LINKS = 8 }; #define MAXSYMLINKS 40 /* * Type of the last component on LOOKUP_PARENT */ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ #define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ #define LOOKUP_DIRECTORY 0x0002 /* require a directory */ #define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ #define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ #define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ #define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ #define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ #define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ /* These tell filesystem methods that we are dealing with the final component... */ #define LOOKUP_OPEN 0x0100 /* ... in open */ #define LOOKUP_CREATE 0x0200 /* ... in object creation */ #define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */ #define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */ /* internal use only */ #define LOOKUP_PARENT 0x0010 /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ #define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ #define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ #define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ #define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ #define LOOKUP_CACHED 0x200000 /* Only do cached lookup */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) extern int path_pts(struct path *path); extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); static inline int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { return user_path_at_empty(dfd, name, flags, path, NULL); } struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); /** * mode_strip_umask - handle vfs umask stripping * @dir: parent directory of the new inode * @mode: mode of the new inode to be created in @dir * * In most filesystems, umask stripping depends on whether or not the * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask * stripping is done directly in here. If the filesystem does support POSIX * ACLs umask stripping is deferred until the filesystem calls * posix_acl_create(). * * Some filesystems (like NFSv4) also want to avoid umask stripping by the * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK * to get this effect without declaring that they support POSIX ACLs. * * Returns: mode */ static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode) { if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK)) mode &= ~current_umask(); return mode; } extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { ((char *) name)[min(len, maxlen)] = '\0'; } /** * retry_estale - determine whether the caller should retry an operation * @error: the error that would currently be returned * @flags: flags being used for next lookup attempt * * Check to see if the error code was -ESTALE, and then determine whether * to retry the call based on whether "flags" already has LOOKUP_REVAL set. * * Returns true if the caller should try the operation again. */ static inline bool retry_estale(const long error, const unsigned int flags) { return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL)); } #endif /* _LINUX_NAMEI_H */
159 158 159 1 1 159 158 115 43 158 43 114 159 1 2 177 177 63 2 63 63 61 67 2 66 66 65 121 79 66 79 7 79 9 9 9 58 1 21 1 433 363 89 300 176 116 13 434 425 125 125 117 425 318 504 507 66 62 70 66 129 79 72 72 63 72 79 13 4 79 44 25 68 2 1 343 314 316 317 304 482 482 459 6 35 35 482 479 3 505 23 184 71 71 66 66 5 5 1 4 4 4 1 2 2 68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } /* If get_ucounts fails more bits are needed in the refcount */ if (unlikely(!get_ucounts(old->ucounts))) { WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
14 14 10 14 14 2 2 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This is an implementation of the ChaCha20Poly1305 AEAD construction. * * Information: https://tools.ietf.org/html/rfc8439 */ #include <crypto/algapi.h> #include <crypto/chacha20poly1305.h> #include <crypto/chacha.h> #include <crypto/poly1305.h> #include <crypto/scatterwalk.h> #include <asm/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/module.h> #define CHACHA_KEY_WORDS (CHACHA_KEY_SIZE / sizeof(u32)) static void chacha_load_key(u32 *k, const u8 *in) { k[0] = get_unaligned_le32(in); k[1] = get_unaligned_le32(in + 4); k[2] = get_unaligned_le32(in + 8); k[3] = get_unaligned_le32(in + 12); k[4] = get_unaligned_le32(in + 16); k[5] = get_unaligned_le32(in + 20); k[6] = get_unaligned_le32(in + 24); k[7] = get_unaligned_le32(in + 28); } static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce) { u32 k[CHACHA_KEY_WORDS]; u8 iv[CHACHA_IV_SIZE]; memset(iv, 0, 8); memcpy(iv + 8, nonce + 16, 8); chacha_load_key(k, key); /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init(chacha_state, k, nonce); hchacha_block(chacha_state, k, 20); chacha_init(chacha_state, k, iv); memzero_explicit(k, sizeof(k)); memzero_explicit(iv, sizeof(iv)); } static void __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, u32 *chacha_state) { const u8 *pad0 = page_address(ZERO_PAGE(0)); struct poly1305_desc_ctx poly1305_state; union { u8 block0[POLY1305_KEY_SIZE]; __le64 lens[2]; } b; chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0)); poly1305_init(&poly1305_state, b.block0); poly1305_update(&poly1305_state, ad, ad_len); if (ad_len & 0xf) poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf)); chacha20_crypt(chacha_state, dst, src, src_len); poly1305_update(&poly1305_state, dst, src_len); if (src_len & 0xf) poly1305_update(&poly1305_state, pad0, 0x10 - (src_len & 0xf)); b.lens[0] = cpu_to_le64(ad_len); b.lens[1] = cpu_to_le64(src_len); poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens)); poly1305_final(&poly1305_state, dst + src_len); memzero_explicit(chacha_state, CHACHA_STATE_WORDS * sizeof(u32)); memzero_explicit(&b, sizeof(b)); } void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, const u8 key[CHACHA20POLY1305_KEY_SIZE]) { u32 chacha_state[CHACHA_STATE_WORDS]; u32 k[CHACHA_KEY_WORDS]; __le64 iv[2]; chacha_load_key(k, key); iv[0] = 0; iv[1] = cpu_to_le64(nonce); chacha_init(chacha_state, k, (u8 *)iv); __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state); memzero_explicit(iv, sizeof(iv)); memzero_explicit(k, sizeof(k)); } EXPORT_SYMBOL(chacha20poly1305_encrypt); void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], const u8 key[CHACHA20POLY1305_KEY_SIZE]) { u32 chacha_state[CHACHA_STATE_WORDS]; xchacha_init(chacha_state, key, nonce); __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state); } EXPORT_SYMBOL(xchacha20poly1305_encrypt); static bool __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, u32 *chacha_state) { const u8 *pad0 = page_address(ZERO_PAGE(0)); struct poly1305_desc_ctx poly1305_state; size_t dst_len; int ret; union { u8 block0[POLY1305_KEY_SIZE]; u8 mac[POLY1305_DIGEST_SIZE]; __le64 lens[2]; } b; if (unlikely(src_len < POLY1305_DIGEST_SIZE)) return false; chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0)); poly1305_init(&poly1305_state, b.block0); poly1305_update(&poly1305_state, ad, ad_len); if (ad_len & 0xf) poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf)); dst_len = src_len - POLY1305_DIGEST_SIZE; poly1305_update(&poly1305_state, src, dst_len); if (dst_len & 0xf) poly1305_update(&poly1305_state, pad0, 0x10 - (dst_len & 0xf)); b.lens[0] = cpu_to_le64(ad_len); b.lens[1] = cpu_to_le64(dst_len); poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens)); poly1305_final(&poly1305_state, b.mac); ret = crypto_memneq(b.mac, src + dst_len, POLY1305_DIGEST_SIZE); if (likely(!ret)) chacha20_crypt(chacha_state, dst, src, dst_len); memzero_explicit(&b, sizeof(b)); return !ret; } bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, const u8 key[CHACHA20POLY1305_KEY_SIZE]) { u32 chacha_state[CHACHA_STATE_WORDS]; u32 k[CHACHA_KEY_WORDS]; __le64 iv[2]; bool ret; chacha_load_key(k, key); iv[0] = 0; iv[1] = cpu_to_le64(nonce); chacha_init(chacha_state, k, (u8 *)iv); ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, chacha_state); memzero_explicit(chacha_state, sizeof(chacha_state)); memzero_explicit(iv, sizeof(iv)); memzero_explicit(k, sizeof(k)); return ret; } EXPORT_SYMBOL(chacha20poly1305_decrypt); bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], const u8 key[CHACHA20POLY1305_KEY_SIZE]) { u32 chacha_state[CHACHA_STATE_WORDS]; xchacha_init(chacha_state, key, nonce); return __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, chacha_state); } EXPORT_SYMBOL(xchacha20poly1305_decrypt); static bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, const u8 key[CHACHA20POLY1305_KEY_SIZE], int encrypt) { const u8 *pad0 = page_address(ZERO_PAGE(0)); struct poly1305_desc_ctx poly1305_state; u32 chacha_state[CHACHA_STATE_WORDS]; struct sg_mapping_iter miter; size_t partial = 0; unsigned int flags; bool ret = true; int sl; union { struct { u32 k[CHACHA_KEY_WORDS]; __le64 iv[2]; }; u8 block0[POLY1305_KEY_SIZE]; u8 chacha_stream[CHACHA_BLOCK_SIZE]; struct { u8 mac[2][POLY1305_DIGEST_SIZE]; }; __le64 lens[2]; } b __aligned(16); if (WARN_ON(src_len > INT_MAX)) return false; chacha_load_key(b.k, key); b.iv[0] = 0; b.iv[1] = cpu_to_le64(nonce); chacha_init(chacha_state, b.k, (u8 *)b.iv); chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0)); poly1305_init(&poly1305_state, b.block0); if (unlikely(ad_len)) { poly1305_update(&poly1305_state, ad, ad_len); if (ad_len & 0xf) poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf)); } flags = SG_MITER_TO_SG | SG_MITER_ATOMIC; sg_miter_start(&miter, src, sg_nents(src), flags); for (sl = src_len; sl > 0 && sg_miter_next(&miter); sl -= miter.length) { u8 *addr = miter.addr; size_t length = min_t(size_t, sl, miter.length); if (!encrypt) poly1305_update(&poly1305_state, addr, length); if (unlikely(partial)) { size_t l = min(length, CHACHA_BLOCK_SIZE - partial); crypto_xor(addr, b.chacha_stream + partial, l); partial = (partial + l) & (CHACHA_BLOCK_SIZE - 1); addr += l; length -= l; } if (likely(length >= CHACHA_BLOCK_SIZE || length == sl)) { size_t l = length; if (unlikely(length < sl)) l &= ~(CHACHA_BLOCK_SIZE - 1); chacha20_crypt(chacha_state, addr, addr, l); addr += l; length -= l; } if (unlikely(length > 0)) { chacha20_crypt(chacha_state, b.chacha_stream, pad0, CHACHA_BLOCK_SIZE); crypto_xor(addr, b.chacha_stream, length); partial = length; } if (encrypt) poly1305_update(&poly1305_state, miter.addr, min_t(size_t, sl, miter.length)); } if (src_len & 0xf) poly1305_update(&poly1305_state, pad0, 0x10 - (src_len & 0xf)); b.lens[0] = cpu_to_le64(ad_len); b.lens[1] = cpu_to_le64(src_len); poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens)); if (likely(sl <= -POLY1305_DIGEST_SIZE)) { if (encrypt) { poly1305_final(&poly1305_state, miter.addr + miter.length + sl); ret = true; } else { poly1305_final(&poly1305_state, b.mac[0]); ret = !crypto_memneq(b.mac[0], miter.addr + miter.length + sl, POLY1305_DIGEST_SIZE); } } sg_miter_stop(&miter); if (unlikely(sl > -POLY1305_DIGEST_SIZE)) { poly1305_final(&poly1305_state, b.mac[1]); scatterwalk_map_and_copy(b.mac[encrypt], src, src_len, sizeof(b.mac[1]), encrypt); ret = encrypt || !crypto_memneq(b.mac[0], b.mac[1], POLY1305_DIGEST_SIZE); } memzero_explicit(chacha_state, sizeof(chacha_state)); memzero_explicit(&b, sizeof(b)); return ret; } bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, const u8 key[CHACHA20POLY1305_KEY_SIZE]) { return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len, nonce, key, 1); } EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace); bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, const u8 key[CHACHA20POLY1305_KEY_SIZE]) { if (unlikely(src_len < POLY1305_DIGEST_SIZE)) return false; return chacha20poly1305_crypt_sg_inplace(src, src_len - POLY1305_DIGEST_SIZE, ad, ad_len, nonce, key, 0); } EXPORT_SYMBOL(chacha20poly1305_decrypt_sg_inplace); static int __init chacha20poly1305_init(void) { if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) && WARN_ON(!chacha20poly1305_selftest())) return -ENODEV; return 0; } static void __exit chacha20poly1305_exit(void) { } module_init(chacha20poly1305_init); module_exit(chacha20poly1305_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("ChaCha20Poly1305 AEAD construction"); MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
12 1 134 101 44 563 536 5 1 2 10 1 381 11308 359 574 659 70 622 24 741 9 71 12137 688 36 25 319 114 1319 117 110 82 117 1090 63 60 467 500 192 82 184 17 223 599 629 576 783 779 76 76 758 87 1053 783 40 784 3 968 3313 312 759 1054 660 26 750 732 557 433 984 158 984 751 200 1001 6190 130 124 5 29 167 4 742 14605 14582 22 51 51 644 8171 19 26 3243 26 1237 202 53 58 71 577 7 2333 297 299 2 1 1 180 576 603 619 618 560 988 41 392 199 10528 226 225 1016 1147 36 36 8 7 3129 138 14 16 1 39 899 253 2003 10900 217 409 357 369 1 89 109 39 21 24 100 97 11 3 23 5 2 6 9 5 801 38 18 4966 18 7199 1739 7202 3 12 12 2 12 179 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the AF_INET socket handler. * * Version: @(#)sock.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche <flla@stud.uni-sb.de> * * Fixes: * Alan Cox : Volatiles in skbuff pointers. See * skbuff comments. May be overdone, * better to prove they can be removed * than the reverse. * Alan Cox : Added a zapped field for tcp to note * a socket is reset and must stay shut up * Alan Cox : New fields for options * Pauline Middelink : identd support * Alan Cox : Eliminate low level recv/recvfrom * David S. Miller : New socket lookup architecture. * Steve Whitehouse: Default routines for sock_ops * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made * protinfo be just a void pointer, as the * protocol specific parts were moved to * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options */ #ifndef _SOCK_H #define _SOCK_H #include <linux/hardirq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/timer.h> #include <linux/cache.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/mm.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> #include <linux/indirect_call_wrapper.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> #include <uapi/linux/socket.h> /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of * the other protocols. */ /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. */ typedef struct { spinlock_t slock; int owned; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics * to the lock validator by explicitly managing * the slock as a lock variant (in addition to * the slock itself): */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } socket_lock_t; struct sock; struct proto; struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @skc_listener: connection request listener socket (aka rsk_listener) * [union with @skc_flags] * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) * [union with @skc_incoming_cpu] * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. */ struct sock_common { union { __addrpair skc_addrpair; struct { __be32 skc_daddr; __be32 skc_rcv_saddr; }; }; union { unsigned int skc_hash; __u16 skc_u16hashes[2]; }; /* skc_dport && skc_num must be grouped as well */ union { __portpair skc_portpair; struct { __be16 skc_dport; __u16 skc_num; }; }; unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr skc_v6_daddr; struct in6_addr skc_v6_rcv_saddr; #endif atomic64_t skc_cookie; /* following fields are padding to force * offset(struct sock, sk_refcnt) == 128 on 64bit arches * assuming IPV6 is enabled. We use this padding differently * for different kind of 'sockets' */ union { unsigned long skc_flags; struct sock *skc_listener; /* request_sock */ struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ }; /* * fields between dontcopy_begin/dontcopy_end * are not copied in sock_copy() */ /* private: */ int skc_dontcopy_begin[0]; /* public: */ union { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING unsigned short skc_rx_queue_mapping; #endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ }; refcount_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; union { u32 skc_rxhash; u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; /* public: */ }; struct bpf_local_storage; struct sk_filter; /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_kern_sock: True if sock is using kernel lock classes * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst * @sk_rx_dst_cookie: cookie for @sk_rx_dst * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags * @sk_write_queue: Packet sending queue * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_reserved_mem: space reserved and non-reclaimable for the socket * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden. * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, * IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end #define sk_hash __sk_common.skc_hash #define sk_portpair __sk_common.skc_portpair #define sk_num __sk_common.skc_num #define sk_dport __sk_common.skc_dport #define sk_addrpair __sk_common.skc_addrpair #define sk_daddr __sk_common.skc_daddr #define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash /* early demux fields */ struct dst_entry __rcu *sk_rx_dst; int sk_rx_dst_ifindex; u32 sk_rx_dst_cookie; socket_lock_t sk_lock; atomic_t sk_drops; int sk_rcvlowat; struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency * access. Therefore we special case it's implementation. * Note : rmem_alloc is in this structure to fill a hole * on 64bit arches, not because its logically part of * backlog. */ struct { atomic_t rmem_alloc; int len; struct sk_buff *head; struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; u32 sk_reserved_mem; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; /* ===== mostly read cache line ===== */ unsigned int sk_napi_id; #endif int sk_rcvbuf; int sk_disconnects; struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; /* private: */ struct socket_wq *sk_wq_raw; /* public: */ }; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif struct dst_entry __rcu *sk_dst_cache; atomic_t sk_omem_alloc; int sk_sndbuf; /* ===== cache line for TX ===== */ int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; union { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; __u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ long sk_sndtimeo; struct timer_list sk_timer; __u32 sk_priority; __u32 sk_mark; unsigned long sk_pacing_rate; /* bytes per second */ unsigned long sk_max_pacing_rate; struct page_frag sk_frag; netdev_features_t sk_route_caps; int sk_gso_type; unsigned int sk_gso_max_size; gfp_t sk_allocation; __u32 sk_txhash; /* * Because of non atomicity rules, all * changes are protected by socket lock. */ u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, sk_userlocks : 4; u8 sk_pacing_shift; u16 sk_type; u16 sk_protocol; u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; u8 sk_txrehash; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; u16 sk_busy_poll_budget; #endif spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif atomic_t sk_tskey; atomic_t sk_zckey; u32 sk_tsflags; u8 sk_shutdown; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; bool sk_use_task_frag; struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; struct mem_cgroup *sk_memcg; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, struct net_device *dev, struct sk_buff *skb); #endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; }; enum sk_pacing { SK_PACING_NONE = 0, SK_PACING_NEEDED = 1, SK_PACING_FQ = 2, }; /* flag bits in sk_user_data * * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might * not be suitable for copying when cloning the socket. For instance, * it can point to a reference counted object. sk_user_data bottom * bit is set if pointer must not be copied. * * - SK_USER_DATA_BPF: Mark whether sk_user_data field is * managed/owned by a BPF reuseport array. This bit should be set * when sk_user_data's sk is added to the bpf's reuseport_array. * * - SK_USER_DATA_PSOCK: Mark whether pointer stored in * sk_user_data points to psock type. This bit should be set * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL #define SK_USER_DATA_PSOCK 4UL #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied * @sk: socket */ static inline bool sk_user_data_is_nocopy(const struct sock *sk) { return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); } #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) /** * __locked_read_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits * * The caller must be holding sk->sk_callback_lock. */ static inline void * __locked_read_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference_check(__sk_user_data(sk), lockdep_is_held(&sk->sk_callback_lock)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } /** * __rcu_dereference_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits */ static inline void * __rcu_dereference_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } #define rcu_dereference_sk_user_data(sk) \ __rcu_dereference_sk_user_data_with_flags(sk, 0) #define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ uintptr_t __tmp1 = (uintptr_t)(ptr), \ __tmp2 = (uintptr_t)(flags); \ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ __tmp1 | __tmp2); \ }) #define rcu_assign_sk_user_data(sk, ptr) \ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) { return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { write_pnet(&sk->sk_net, net); } /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE * on a socket means that the socket will reuse everybody else's port * without looking at the other's sk_reuse value. */ #define SK_NO_REUSE 0 #define SK_CAN_REUSE 1 #define SK_FORCE_REUSE 2 int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(const struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { return READ_ONCE(sk->sk_peek_off); } return 0; } static inline void sk_peek_offset_bwd(struct sock *sk, int val) { s32 off = READ_ONCE(sk->sk_peek_off); if (unlikely(off >= 0)) { off = max_t(s32, off - val, 0); WRITE_ONCE(sk->sk_peek_off, off); } } static inline void sk_peek_offset_fwd(struct sock *sk, int val) { sk_peek_offset_bwd(sk, -val); } /* * Hashed lists helper routines */ static inline struct sock *sk_entry(const struct hlist_node *node) { return hlist_entry(node, struct sock, sk_node); } static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); } static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); } static inline struct sock *sk_next(const struct sock *sk) { return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) { return (!is_a_nulls(sk->sk_nulls_node.next)) ? hlist_nulls_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL; } static inline bool sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } static inline bool sk_hashed(const struct sock *sk) { return !sk_unhashed(sk); } static inline void sk_node_init(struct hlist_node *node) { node->pprev = NULL; } static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); } /* NB: equivalent to hlist_del_init_rcu */ static inline bool __sk_del_node_init(struct sock *sk) { if (sk_hashed(sk)) { __sk_del_node(sk); sk_node_init(&sk->sk_node); return true; } return false; } /* Grab socket reference count. This operation is valid only when sk is ALREADY grabbed f.e. it is found in hash table or a list and the lookup is made under lock preventing hash table modifications. */ static __always_inline void sock_hold(struct sock *sk) { refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt cannot hit zero, f.e. it is true in context of any socketcall. */ static __always_inline void __sock_put(struct sock *sk) { refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return true; } return false; } static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); } static inline void sk_add_node(struct sock *sk, struct hlist_head *list) { sock_hold(sk); __sk_add_node(sk, list); } static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&sk->sk_node, list); else hlist_add_head_rcu(&sk->sk_node, list); } static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); hlist_add_tail_rcu(&sk->sk_node, list); } static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); __sk_nulls_add_node_rcu(sk, list); } static inline void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); } static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_bind_node, list); } #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, sk_node) #define sk_nulls_for_each(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) #define sk_nulls_for_each_rcu(__sk, node, list) \ hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk) \ hlist_for_each_entry_from(__sk, sk_node) #define sk_nulls_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(const struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from * userspace. */ return sk->sk_socket->file->f_cred->user_ns; } /* Sock flags */ enum sock_flags { SOCK_DEAD, SOCK_DONE, SOCK_URGINLINE, SOCK_KEEPOPEN, SOCK_LINGER, SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, SOCK_ZAPPED, SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ SOCK_WIFI_STATUS, /* push wifi status to userspace */ SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. * Will use last 4 bytes of packet sent from * user-space instead. */ SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { nsk->sk_flags = osk->sk_flags; } static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); } static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); } static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, int valbool) { if (valbool) sock_set_flag(sk, bit); else sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) { return test_bit(flag, &sk->sk_flags); } #ifdef CONFIG_NET DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) { return 0; } static inline void __receive_sock(struct file *file) { } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } /* Note: If you think the test should be: * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.") */ static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* * Compute minimal free write space needed to queue new packets. */ static inline int sk_stream_min_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); } static inline void sk_wmem_queued_add(struct sock *sk, int val) { WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } static inline void sk_forward_alloc_add(struct sock *sk, int val) { /* Paired with lockless reads of sk->sk_forward_alloc */ WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); } void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ skb_dst_force(skb); if (!sk->sk_backlog.tail) WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } /* * Take into account size of receive queue and backlog queue * Do not take into account this skb truesize, * to allow even a single big packet to come. */ static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit) { unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); return qsize > limit; } /* The per-socket spinlock must be held here. */ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, unsigned int limit) { if (sk_rcvqueues_full(sk, limit)) return -ENOBUFS; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; __sk_add_backlog(sk, skb); sk->sk_backlog.len += skb->truesize; return 0; } int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); return INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) { int cpu = raw_smp_processor_id(); if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } static inline void sock_rps_record_flow_hash(__u32 hash) { #ifdef CONFIG_RPS struct rps_sock_flow_table *sock_flow_table; rcu_read_lock(); sock_flow_table = rcu_dereference(rps_sock_flow_table); rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); #endif } static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS if (static_branch_unlikely(&rfs_needed)) { /* Reading sk->sk_rxhash might incur an expensive cache line * miss. * * TCP_ESTABLISHED does cover almost all states where RFS * might be useful, and is cheaper [1] than testing : * IPv4: inet_sk(sk)->inet_daddr * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) * OR an additional socket flag * [1] : sk_state and sk_prot are in the same cache line. */ if (sk->sk_state == TCP_ESTABLISHED) { /* This READ_ONCE() is paired with the WRITE_ONCE() * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). */ sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); } } #endif } static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in sock_rps_record_flow(). */ if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash)) WRITE_ONCE(sk->sk_rxhash, skb->hash); #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS /* Paired with READ_ONCE() in sock_rps_record_flow() */ WRITE_ONCE(sk->sk_rxhash, 0); #endif } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ *(__timeo) = wait_woken(__wait, \ TASK_INTERRUPTIBLE, \ *(__timeo)); \ } \ sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \ __rc; \ }) int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); int sk_stream_error(struct sock *sk, int flags, int err); void sk_stream_kill_queues(struct sock *sk); void sk_set_memalloc(struct sock *sk); void sk_clear_memalloc(struct sock *sk); void __sk_flush_backlog(struct sock *sk); static inline bool sk_flush_backlog(struct sock *sk) { if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { __sk_flush_backlog(sk); return true; } return false; } int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) { if (offsetof(struct sock, sk_node.next) != 0) memset(sk, 0, offsetof(struct sock, sk_node.next)); memset(&sk->sk_node.pprev, 0, size - offsetof(struct sock, sk_node.pprev)); } /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, int flags, int *err, bool kern); int (*ioctl)(struct sock *sk, int cmd, int *karg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*bind_add)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); bool (*bpf_bypass_getsockopt)(int level, int optname); void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); void (*put_port)(struct sock *sk); #ifdef CONFIG_BPF_SYSCALL int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); #endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS unsigned int inuse_idx; #endif #if IS_ENABLED(CONFIG_MPTCP) int (*forward_alloc_get)(const struct sock *sk); #endif bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ int __percpu *per_cpu_fw_alloc; struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; u32 sysctl_wmem_offset; u32 sysctl_rmem_offset; int max_header; bool no_autobind; struct kmem_cache *slab; unsigned int obj_size; unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ unsigned int __percpu *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; struct module *owner; char name[32]; struct list_head node; int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); static inline int sk_forward_alloc_get(const struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP) if (sk->sk_prot->forward_alloc_get) return sk->sk_prot->forward_alloc_get(sk); #endif return READ_ONCE(sk->sk_forward_alloc); } static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free, tcp_stream_memory_free, sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) { return __sk_stream_memory_free(sk, 0); } static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && __sk_stream_memory_free(sk, wake); } static inline bool sk_stream_is_writeable(const struct sock *sk) { return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, struct cgroup *ancestor) { #ifdef CONFIG_SOCK_CGROUP_DATA return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), ancestor); #else return -ENOTSUPP; #endif } static inline bool sk_has_memory_pressure(const struct sock *sk) { return sk->sk_prot->memory_pressure != NULL; } static inline bool sk_under_global_memory_pressure(const struct sock *sk) { return sk->sk_prot->memory_pressure && !!READ_ONCE(*sk->sk_prot->memory_pressure); } static inline bool sk_under_memory_pressure(const struct sock *sk) { if (!sk->sk_prot->memory_pressure) return false; if (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return !!READ_ONCE(*sk->sk_prot->memory_pressure); } static inline long proto_memory_allocated(const struct proto *prot) { return max(0L, atomic_long_read(prot->memory_allocated)); } static inline long sk_memory_allocated(const struct sock *sk) { return proto_memory_allocated(sk->sk_prot); } /* 1 MB per cpu, in page units */ #define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) static inline void sk_memory_allocated_add(struct sock *sk, int amt) { int local_reserve; preempt_disable(); local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt); if (local_reserve >= SK_MEMORY_PCPU_RESERVE) { __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); } preempt_enable(); } static inline void sk_memory_allocated_sub(struct sock *sk, int amt) { int local_reserve; preempt_disable(); local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt); if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) { __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); } preempt_enable(); } #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline void sk_sockets_allocated_inc(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline u64 sk_sockets_allocated_read_positive(struct sock *sk) { return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); } static inline int proto_sockets_allocated_sum_positive(struct proto *prot) { return percpu_counter_sum_positive(prot->sockets_allocated); } static inline bool proto_memory_pressure(struct proto *prot) { if (!prot->memory_pressure) return false; return !!READ_ONCE(*prot->memory_pressure); } #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { int all; int val[PROTO_INUSE_NR]; }; static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } static inline void sock_inuse_add(const struct net *net, int val) { this_cpu_add(net->core.prot_inuse->all, val); } int sock_prot_inuse_get(struct net *net, struct proto *proto); int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { } static inline void sock_inuse_add(const struct net *net, int val) { } #endif /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. */ static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); return sk->sk_prot->hash(sk); } /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) /* Sockets 0-1023 can't be bound to unless you are superuser */ #define PROT_SOCK 1024 #define SHUTDOWN_MASK 3 #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 struct socket_alloc { struct socket socket; struct inode vfs_inode; }; static inline struct socket *SOCKET_I(struct inode *inode) { return &container_of(inode, struct socket_alloc, vfs_inode)->socket; } static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } /* * Functions for memory accounting */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 /* sysctl_mem values are in pages */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { return READ_ONCE(sk->sk_prot->sysctl_mem[index]); } static inline int sk_mem_pages(int amt) { return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT; } static inline bool sk_has_account(struct sock *sk) { /* return true if protocol supports memory accounting */ return !!sk->sk_prot->memory_allocated; } static inline bool sk_wmem_schedule(struct sock *sk, int size) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); } static inline bool sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || skb_pfmemalloc(skb); } static inline int sk_unused_reserved_mem(const struct sock *sk) { int unused_mem; if (likely(!sk->sk_reserved_mem)) return 0; unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued - atomic_read(&sk->sk_rmem_alloc); return unused_mem > 0 ? unused_mem : 0; } static inline void sk_mem_reclaim(struct sock *sk) { int reclaimable; if (!sk_has_account(sk)) return; reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); if (reclaimable >= (int)PAGE_SIZE) __sk_mem_reclaim(sk, reclaimable); } static inline void sk_mem_reclaim_final(struct sock *sk) { sk->sk_reserved_mem = 0; sk_mem_reclaim(sk); } static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. * * Mark both the sk_lock and the sk_lock.slock as a * per-address-family lock class. */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) static inline bool lockdep_sock_is_held(const struct sock *sk) { return lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } void lock_sock_nested(struct sock *sk, int subclass); static inline void lock_sock(struct sock *sk) { lock_sock_nested(sk, 0); } void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) #define bh_lock_sock_nested(__sk) \ spin_lock_nested(&((__sk)->sk_lock.slock), \ SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); /** * lock_sock_fast - fast version of lock_sock * @sk: socket * * This version should be used for very small section, where process wont block * return false if fast path is taken: * * sk_lock.slock locked, owned = 0, BH disabled * * return true if slow path is taken: * * sk_lock.slock unlocked, owned = 1, BH enabled */ static inline bool lock_sock_fast(struct sock *sk) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); return __lock_sock_fast(sk); } /* fast socket lock variant for caller already holding a [different] socket lock */ static inline bool lock_sock_fast_nested(struct sock *sk) { mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); return __lock_sock_fast(sk); } /** * unlock_sock_fast - complement of lock_sock_fast * @sk: socket * @slow: slow mode * * fast unlock socket for user context. * If slow mode is on, we call regular release_sock() */ static inline void unlock_sock_fast(struct sock *sk, bool slow) __releases(&sk->sk_lock.slock) { if (slow) { release_sock(sk); __release(&sk->sk_lock.slock); } else { mutex_release(&sk->sk_lock.dep_map, _RET_IP_); spin_unlock_bh(&sk->sk_lock.slock); } } void sockopt_lock_sock(struct sock *sk); void sockopt_release_sock(struct sock *sk); bool sockopt_ns_capable(struct user_namespace *ns, int cap); bool sockopt_capable(int cap); /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming * packets, so that we won't get any new data or any * packets that change the state of the socket. * * While locked, BH processing will add new packets to * the backlog queue. This queue is processed by the * owner of the socket lock right before it is released. * * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. */ static inline void sock_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks); #endif } static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); return sk->sk_lock.owned; } static inline bool sock_owned_by_user_nocheck(const struct sock *sk) { return sk->sk_lock.owned; } static inline void sock_release_ownership(struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk)); sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { struct sock *sk = (struct sock *)csk; return !sock_owned_by_user_nocheck(sk) && !spin_is_locked(&sk->sk_lock.slock); } struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); #else #define sock_edemux sock_efree #endif int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen); int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen); int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); static inline void sock_replace_proto(struct sock *sk, struct proto *proto) { if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); WRITE_ONCE(sk->sk_prot, proto); } struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags) }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); /* * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, int, bool); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); /* * Functions to fill in entries in struct proto_ops when a protocol * uses the inet style. */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); /* * Default socket callbacks and setup code */ /* Initialise core socket variables using an explicit uid. */ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); /* Initialise core socket variables. * Assumes struct socket *sock is embedded in a struct socket_alloc. */ void sock_init_data(struct socket *sock, struct sock *sk); /* * Socket reference counting postulates. * * * Each user of socket SHOULD hold a reference count. * * Each access point to socket (an hash table bucket, reference from a list, * running timer, skb in flight MUST hold a reference count. * * When reference count hits 0, it means it will never increase back. * * When reference count hits 0, it means that no references from * outside exist to this socket and current process on current CPU * is last user and may/should destroy this socket. * * sk_free is called from any context: process, BH, IRQ. When * it is called, socket has no references from outside -> sk_free * may release descendant resources allocated by the socket, but * to the time when it is called, socket is NOT referenced by any * hash tables, lists etc. * * Packets, delivered from outside (from network or from another process) * and enqueued on receive/error queues SHOULD NOT grab reference count, * when they sit in queue. Otherwise, packets will leak to hole, when * socket is looked up by one cpu and unhasing is made by another CPU. * It is true for udp/raw, netlink (leak to receive and error queues), tcp * (leak to backlog). Packet socket does all the processing inside * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets * use separate SMP lock, so that they are prone too. */ /* Ungrab socket and destroy it, if it was the last reference. */ static inline void sock_put(struct sock *sk) { if (refcount_dec_and_test(&sk->sk_refcnt)) sk_free(sk); } /* Generic version of sock_put(), dealing with all sockets * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...) */ void sock_gen_put(struct sock *sk); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted); static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) { return __sk_receive_skb(sk, skb, nested, 1, true); } static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { /* sk_tx_queue_mapping accept only upto a 16-bit value */ if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) return; /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); } #define NO_QUEUE_MAPPING USHRT_MAX static inline void sk_tx_queue_clear(struct sock *sk) { /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } static inline int sk_tx_queue_get(const struct sock *sk) { if (sk) { /* Paired with WRITE_ONCE() in sk_tx_queue_clear() * and sk_tx_queue_set(). */ int val = READ_ONCE(sk->sk_tx_queue_mapping); if (val != NO_QUEUE_MAPPING) return val; } return -1; } static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, bool force_set) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); if (force_set || unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue); } #endif } static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, true); } static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, false); } static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING); #endif } static inline int sk_rx_queue_get(const struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (sk) { int res = READ_ONCE(sk->sk_rx_queue_mapping); if (res != NO_QUEUE_MAPPING) return res; } #endif return -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; } static inline wait_queue_head_t *sk_sleep(struct sock *sk) { BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); return &rcu_dereference_raw(sk->sk_wq)->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, * we do not release it in this function, because protocol * probably wants some additional cleanups or even continuing * to work with this socket (TCP). */ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } kuid_t sock_i_uid(struct sock *sk); unsigned long __sock_i_ino(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) { u32 v = get_random_u32(); return v ?: 1; } static inline void sk_set_txhash(struct sock *sk) { /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline bool sk_rethink_txhash(struct sock *sk) { if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } return false; } static inline struct dst_entry * __sk_dst_get(const struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, lockdep_sock_is_held(sk)); } static inline struct dst_entry * sk_dst_get(const struct sock *sk) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); if (dst && !rcuref_get(&dst->__rcuref)) dst = NULL; rcu_read_unlock(); return dst; } static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *ndst, *dst = __sk_dst_get(sk); if (dst && dst->ops->negative_advice) { ndst = dst->ops->negative_advice(dst); if (ndst != dst) { rcu_assign_pointer(sk->sk_dst_cache, ndst); sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); } } } static inline void dst_negative_advice(struct sock *sk) { sk_rethink_txhash(sk); __dst_negative_advice(sk); } static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = rcu_dereference_protected(sk->sk_dst_cache, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void __sk_dst_reset(struct sock *sk) { __sk_dst_set(sk, NULL); } static inline void sk_dst_reset(struct sock *sk) { sk_dst_set(sk, NULL); } struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { if (!READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) { if (skb_get_dst_pending_confirm(skb)) { struct sock *sk = skb->sk; if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 0); neigh_confirm(n); } } bool sk_mc_loop(const struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline void sk_gso_disable(struct sock *sk) { sk->sk_gso_disabled = 1; sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) { if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, copy, &csum, from)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, offset); } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) return -EFAULT; return 0; } static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, int copy) { int err, offset = skb->len; err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), copy, offset); if (err) __skb_trim(skb, offset); return err; } static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, struct sk_buff *skb, struct page *page, int off, int copy) { int err; err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, copy, skb->len); if (err) return err; skb_len_add(skb, copy); sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } /** * sk_wmem_alloc_get - returns write allocations * @sk: socket * * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) - 1; } /** * sk_rmem_alloc_get - returns read allocations * @sk: socket * * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc); } /** * sk_has_allocations - check if allocations are outstanding * @sk: socket * * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); } /** * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths:: * * CPU1 CPU2 * sys_select receive packet * ... ... * __add_wait_queue update tp->rcv_nxt * ... ... * tp->rcv_nxt check sock_def_readable * ... { * schedule rcu_read_lock(); * wq = rcu_dereference(sk->sk_wq); * if (wq && waitqueue_active(&wq->wait)) * wake_up_interruptible(&wq->wait) * ... * } * * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 * could then endup calling schedule and sleep forever if there are no more * data on the socket. * */ static inline bool skwq_has_sleeper(struct socket_wq *wq) { return wq && wq_has_sleeper(&wq->wait); } /** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table * * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { if (!poll_does_not_wait(p)) { poll_wait(filp, &sock->wq.wait, p); /* We need to be sure we are in sync with the * socket flags modification. * * This memory barrier is paired in the wq_has_sleeper. */ smp_mb(); } } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { /* This pairs with WRITE_ONCE() in sk_set_txhash() */ u32 txhash = READ_ONCE(sk->sk_txhash); if (txhash) { skb->l4_hash = 1; skb->hash = txhash; } } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk); /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in * and play with them. * * Inlined as it's very short and called for pretty much every * packet ever received. */ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) { if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb_orphan(skb); skb->destructor = sock_efree; skb->sk = sk; return true; } return false; } static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) { skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); if (skb) { if (sk_rmem_schedule(sk, skb, skb->truesize)) { skb_set_owner_r(skb, sk); return skb; } __kfree_skb(skb); } return NULL; } static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { skb_orphan(skb); return; } skb->slow_gro = 1; } void sk_reset_timer(struct sock *sk, struct timer_list *timer, unsigned long expires); void sk_stop_timer(struct sock *sk, struct timer_list *timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return sock_queue_rcv_skb_reason(sk, skb, NULL); } int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically */ static inline int sock_error(struct sock *sk) { int err; /* Avoid an atomic operation for the common case. * This is racy since another cpu/thread can change sk_err under us. */ if (likely(data_race(!sk->sk_err))) return 0; err = xchg(&sk->sk_err, 0); return -err; } void sk_error_report(struct sock *sk); static inline unsigned long sock_wspace(struct sock *sk) { int amt = 0; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; } return amt; } /* Note: * We use sk->sk_wq_raw, from contexts knowing this * pointer is not NULL and cannot disappear/change. */ static inline void sk_set_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; clear_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_wake_async(const struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) { rcu_read_lock(); sock_wake_async(rcu_dereference(sk->sk_wq), how, band); rcu_read_unlock(); } } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at * minimum. */ #define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff))) #define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2) #define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE static inline void sk_stream_moderate_sndbuf(struct sock *sk) { u32 val; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); val = max_t(u32, val, sk_unused_reserved_mem(sk)); WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } /** * sk_page_frag - return an appropriate page_frag * @sk: socket * * Use the per task page_frag instead of the per socket one for * optimization when we know that we're in process context and own * everything that's associated with %current. * * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { if (sk->sk_use_task_frag) return &current->task_frag; return &sk->sk_frag; } bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline gfp_t gfp_memcg_charge(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_rcvtimeo; } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_sndtimeo; } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); return v ?: 1; } /* Alas, with timeout socket operations are not restartable. * Compare this to poll(). */ static inline int sock_intr_errno(long timeo) { return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } struct sock_skb_cb { u32 dropcount; }; /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] would keep using it directly and utilize its * alignement guarantee. */ #define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb))) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); atomic_add(segs, &sk->sk_drops); } static inline ktime_t sock_read_timestamp(struct sock *sk) { #if BITS_PER_LONG==32 unsigned int seq; ktime_t kt; do { seq = read_seqbegin(&sk->sk_stamp_seq); kt = sk->sk_stamp; } while (read_seqretry(&sk->sk_stamp_seq, seq)); return kt; #else return READ_ONCE(sk->sk_stamp); #endif } static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) { #if BITS_PER_LONG==32 write_seqlock(&sk->sk_stamp_seq); sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else WRITE_ONCE(sk->sk_stamp, kt); #endif } void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested * - software time stamp available and wanted * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb)) __sock_recv_wifi_status(msg, sk, skb); } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVMARK)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) if (sk->sk_flags & FLAGS_RECV_CMSGS || READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @tsflags: timestamping flags to use * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __u8 *tx_flags, __u32 *tskey) { if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, __u8 *tx_flags) { _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); } static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) { _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } static inline bool sk_is_tcp(const struct sock *sk) { return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; } static inline bool sk_is_stream_unix(const struct sock *sk) { return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; } /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { #ifdef CONFIG_INET return skb->destructor == sock_pfree; #else return false; #endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, * ie _not_ a timewait or request socket. */ static inline bool sk_fullsock(const struct sock *sk) { return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } static inline bool sk_is_refcounted(struct sock *sk) { /* Only full sockets have sk->sk_flags. */ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } /** * skb_steal_sock - steal a socket from an sk_buff * @skb: sk_buff to steal the socket from * @refcounted: is set to true if the socket is reference-counted * @prefetched: is set to true if the socket was assigned from bpf */ static inline struct sock * skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched) { if (skb->sk) { struct sock *sk = skb->sk; *refcounted = true; *prefetched = skb_sk_is_prefetched(skb); if (*prefetched) *refcounted = sk_is_refcounted(sk); skb->destructor = NULL; skb->sk = NULL; return sk; } *prefetched = false; *refcounted = false; return NULL; } /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); #ifdef CONFIG_TLS_DEVICE } else if (unlikely(skb->decrypted)) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; #endif } #endif return skb; } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ static inline bool sk_listener(const struct sock *sk) { return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap); bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern int sysctl_tstamp_allow_data; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; #define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) * Some wifi drivers need to tweak it to get more chunks. * They can use this helper from their ndo_start_xmit() */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device * index is either the same or that the socket is bound to an L3 * master device and the given device index is also enslaved to * that L3 master */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); if (mdif && mdif == bound_dev_if) return true; return false; } void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_mark(struct sock *sk, u32 val); void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval); int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size); int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { if (sk->sk_prot->sock_is_readable) return sk->sk_prot->sock_is_readable(sk); return false; } #endif /* _SOCK_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 // SPDX-License-Identifier: GPL-2.0-only /* * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): * Murali Karicheri <m-karicheri2@ti.com> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/debugfs.h> #include "hsr_main.h" #include "hsr_framereg.h" static struct dentry *hsr_debugfs_root_dir; /* hsr_node_table_show - Formats and prints node_table entries */ static int hsr_node_table_show(struct seq_file *sfp, void *data) { struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; seq_printf(sfp, "Node Table entries for (%s) device\n", (priv->prot_version == PRP_V1 ? "PRP" : "HSR")); seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); seq_puts(sfp, "time_in[B], Address-B port, "); if (priv->prot_version == PRP_V1) seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n"); else seq_puts(sfp, "DAN-H\n"); rcu_read_lock(); list_for_each_entry_rcu(node, &priv->node_db, mac_list) { /* skip self node */ if (hsr_addr_is_self(priv, node->macaddress_A)) continue; seq_printf(sfp, "%pM ", &node->macaddress_A[0]); seq_printf(sfp, "%pM ", &node->macaddress_B[0]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); seq_printf(sfp, "%14x, ", node->addr_B_port); if (priv->prot_version == PRP_V1) seq_printf(sfp, "%5x, %5x, %5x\n", node->san_a, node->san_b, (node->san_a == 0 && node->san_b == 0)); else seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); struct dentry *d; d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, hsr_debugfs_root_dir, dev->name); if (IS_ERR(d)) netdev_warn(dev, "failed to rename\n"); else priv->node_tbl_root = d; } /* hsr_debugfs_init - create hsr node_table file for dumping * the node table * * Description: * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { struct dentry *de = NULL; de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); if (IS_ERR(de)) { pr_err("Cannot create hsr debugfs directory\n"); return; } priv->node_tbl_root = de; de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_node_table_fops); if (IS_ERR(de)) { pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; return; } } /* hsr_debugfs_term - Tear down debugfs intrastructure * * Description: * When Debugfs is configured this routine removes debugfs file system * elements that are specific to hsr */ void hsr_debugfs_term(struct hsr_priv *priv) { debugfs_remove_recursive(priv->node_tbl_root); priv->node_tbl_root = NULL; } void hsr_debugfs_create_root(void) { hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); if (IS_ERR(hsr_debugfs_root_dir)) { pr_err("Cannot create hsr debugfs root directory\n"); hsr_debugfs_root_dir = NULL; } } void hsr_debugfs_remove_root(void) { /* debugfs_remove() internally checks NULL and ERROR */ debugfs_remove(hsr_debugfs_root_dir); }
2955 2957 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 Google LLC. */ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H #include <linux/sched.h> #include <linux/bpf.h> #include <linux/lsm_hooks.h> #ifdef CONFIG_BPF_LSM #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ RET bpf_lsm_##NAME(__VA_ARGS__); #include <linux/lsm_hook_defs.h> #undef LSM_HOOK struct bpf_storage_blob { struct bpf_local_storage __rcu *storage; }; extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return false; } static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) { return false; } static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { return -EOPNOTSUPP; } static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { return NULL; } static inline void bpf_inode_storage_free(struct inode *inode) { } static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { } #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */
472 196 197 482 479 8 477 469 467 24 463 480 34 12 34 38 39 39 38 38 24 23 23 17 23 483 459 459 461 1 56 57 477 480 482 58 56 43 38 7 39 20 479 480 479 38 19 19 38 142 10 120 49 49 49 49 49 49 49 14 14 442 442 444 442 443 442 441 462 461 463 445 447 10 10 178 110 10 140 17 13 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 // SPDX-License-Identifier: GPL-2.0-or-later /* * Scatterlist Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. */ #include <linux/err.h> #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/module.h> #include <linux/param.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/completion.h> #include "internal.h" LIST_HEAD(crypto_alg_list); EXPORT_SYMBOL_GPL(crypto_alg_list); DECLARE_RWSEM(crypto_alg_sem); EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); #ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); #endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); struct crypto_alg *crypto_mod_get(struct crypto_alg *alg) { return try_module_get(alg->cra_module) ? crypto_alg_get(alg) : NULL; } EXPORT_SYMBOL_GPL(crypto_mod_get); void crypto_mod_put(struct crypto_alg *alg) { struct module *module = alg->cra_module; crypto_alg_put(alg); module_put(module); } EXPORT_SYMBOL_GPL(crypto_mod_put); static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *q, *alg = NULL; int best = -2; list_for_each_entry(q, &crypto_alg_list, cra_list) { int exact, fuzzy; if (crypto_is_moribund(q)) continue; if ((q->cra_flags ^ type) & mask) continue; if (crypto_is_larval(q) && !crypto_is_test_larval((struct crypto_larval *)q) && ((struct crypto_larval *)q)->mask != mask) continue; exact = !strcmp(q->cra_driver_name, name); fuzzy = !strcmp(q->cra_name, name); if (!exact && !(fuzzy && q->cra_priority > best)) continue; if (unlikely(!crypto_mod_get(q))) continue; best = q->cra_priority; if (alg) crypto_mod_put(alg); alg = q; if (exact) break; } return alg; } static void crypto_larval_destroy(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; BUG_ON(!crypto_is_larval(alg)); if (!IS_ERR_OR_NULL(larval->adult)) crypto_mod_put(larval->adult); kfree(larval); } struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask) { struct crypto_larval *larval; larval = kzalloc(sizeof(*larval), GFP_KERNEL); if (!larval) return ERR_PTR(-ENOMEM); larval->mask = mask; larval->alg.cra_flags = CRYPTO_ALG_LARVAL | type; larval->alg.cra_priority = -1; larval->alg.cra_destroy = crypto_larval_destroy; strscpy(larval->alg.cra_name, name, CRYPTO_MAX_ALG_NAME); init_completion(&larval->completion); return larval; } EXPORT_SYMBOL_GPL(crypto_larval_alloc); static struct crypto_alg *crypto_larval_add(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_larval *larval; larval = crypto_larval_alloc(name, type, mask); if (IS_ERR(larval)) return ERR_CAST(larval); refcount_set(&larval->alg.cra_refcnt, 2); down_write(&crypto_alg_sem); alg = __crypto_alg_lookup(name, type, mask); if (!alg) { alg = &larval->alg; list_add(&alg->cra_list, &crypto_alg_list); } up_write(&crypto_alg_sem); if (alg != &larval->alg) { kfree(larval); if (crypto_is_larval(alg)) alg = crypto_larval_wait(alg); } return alg; } void crypto_larval_kill(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; down_write(&crypto_alg_sem); list_del(&alg->cra_list); up_write(&crypto_alg_sem); complete_all(&larval->completion); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_larval_kill); void crypto_wait_for_test(struct crypto_larval *larval) { int err; err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult); if (WARN_ON_ONCE(err != NOTIFY_STOP)) goto out; err = wait_for_completion_killable(&larval->completion); WARN_ON(err); out: crypto_larval_kill(&larval->alg); } EXPORT_SYMBOL_GPL(crypto_wait_for_test); static void crypto_start_test(struct crypto_larval *larval) { if (!crypto_is_test_larval(larval)) return; if (larval->test_started) return; down_write(&crypto_alg_sem); if (larval->test_started) { up_write(&crypto_alg_sem); return; } larval->test_started = true; up_write(&crypto_alg_sem); crypto_wait_for_test(larval); } static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; long timeout; if (!crypto_boot_test_finished()) crypto_start_test(larval); timeout = wait_for_completion_killable_timeout( &larval->completion, 60 * HZ); alg = larval->adult; if (timeout < 0) alg = ERR_PTR(-EINTR); else if (!timeout) alg = ERR_PTR(-ETIMEDOUT); else if (!alg) alg = ERR_PTR(-ENOENT); else if (IS_ERR(alg)) ; else if (crypto_is_test_larval(larval) && !(alg->cra_flags & CRYPTO_ALG_TESTED)) alg = ERR_PTR(-EAGAIN); else if (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL) alg = ERR_PTR(-EAGAIN); else if (!crypto_mod_get(alg)) alg = ERR_PTR(-EAGAIN); crypto_mod_put(&larval->alg); return alg; } static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type, u32 mask) { const u32 fips = CRYPTO_ALG_FIPS_INTERNAL; struct crypto_alg *alg; u32 test = 0; if (!((type | mask) & CRYPTO_ALG_TESTED)) test |= CRYPTO_ALG_TESTED; down_read(&crypto_alg_sem); alg = __crypto_alg_lookup(name, (type | test) & ~fips, (mask | test) & ~fips); if (alg) { if (((type | mask) ^ fips) & fips) mask |= fips; mask &= fips; if (!crypto_is_larval(alg) && ((type ^ alg->cra_flags) & mask)) { /* Algorithm is disallowed in FIPS mode. */ crypto_mod_put(alg); alg = ERR_PTR(-ENOENT); } } else if (test) { alg = __crypto_alg_lookup(name, type, mask); if (alg && !crypto_is_larval(alg)) { /* Test failed */ crypto_mod_put(alg); alg = ERR_PTR(-ELIBBAD); } } up_read(&crypto_alg_sem); return alg; } static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; if (!name) return ERR_PTR(-ENOENT); type &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); alg = crypto_alg_lookup(name, type, mask); if (!alg && !(mask & CRYPTO_NOLOAD)) { request_module("crypto-%s", name); if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask & CRYPTO_ALG_NEED_FALLBACK)) request_module("crypto-%s-all", name); alg = crypto_alg_lookup(name, type, mask); } if (!IS_ERR_OR_NULL(alg) && crypto_is_larval(alg)) alg = crypto_larval_wait(alg); else if (!alg) alg = crypto_larval_add(name, type, mask); return alg; } int crypto_probing_notify(unsigned long val, void *v) { int ok; ok = blocking_notifier_call_chain(&crypto_chain, val, v); if (ok == NOTIFY_DONE) { request_module("cryptomgr"); ok = blocking_notifier_call_chain(&crypto_chain, val, v); } return ok; } EXPORT_SYMBOL_GPL(crypto_probing_notify); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_alg *larval; int ok; /* * If the internal flag is set for a cipher, require a caller to * invoke the cipher with the internal flag to use that cipher. * Also, if a caller wants to allocate a cipher that may or may * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and * !(mask & CRYPTO_ALG_INTERNAL). */ if (!((type | mask) & CRYPTO_ALG_INTERNAL)) mask |= CRYPTO_ALG_INTERNAL; larval = crypto_larval_lookup(name, type, mask); if (IS_ERR(larval) || !crypto_is_larval(larval)) return larval; ok = crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval); if (ok == NOTIFY_STOP) alg = crypto_larval_wait(larval); else { crypto_mod_put(larval); alg = ERR_PTR(-ENOENT); } crypto_larval_kill(larval); return alg; } EXPORT_SYMBOL_GPL(crypto_alg_mod_lookup); static void crypto_exit_ops(struct crypto_tfm *tfm) { const struct crypto_type *type = tfm->__crt_alg->cra_type; if (type && tfm->exit) tfm->exit(tfm); } static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask) { const struct crypto_type *type_obj = alg->cra_type; unsigned int len; len = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1); if (type_obj) return len + type_obj->ctxsize(alg, type, mask); switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) { default: BUG(); case CRYPTO_ALG_TYPE_CIPHER: len += crypto_cipher_ctxsize(alg); break; case CRYPTO_ALG_TYPE_COMPRESS: len += crypto_compress_ctxsize(alg); break; } return len; } void crypto_shoot_alg(struct crypto_alg *alg) { down_write(&crypto_alg_sem); alg->cra_flags |= CRYPTO_ALG_DYING; up_write(&crypto_alg_sem); } EXPORT_SYMBOL_GPL(crypto_shoot_alg); struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type, u32 mask, gfp_t gfp) { struct crypto_tfm *tfm; unsigned int tfm_size; int err = -ENOMEM; tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask); tfm = kzalloc(tfm_size, gfp); if (tfm == NULL) goto out_err; tfm->__crt_alg = alg; refcount_set(&tfm->refcnt, 1); if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm))) goto cra_init_failed; goto out; cra_init_failed: crypto_exit_ops(tfm); if (err == -EAGAIN) crypto_shoot_alg(alg); kfree(tfm); out_err: tfm = ERR_PTR(err); out: return tfm; } EXPORT_SYMBOL_GPL(__crypto_alloc_tfmgfp); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask) { return __crypto_alloc_tfmgfp(alg, type, mask, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__crypto_alloc_tfm); /* * crypto_alloc_base - Locate algorithm and allocate transform * @alg_name: Name of algorithm * @type: Type of algorithm * @mask: Mask for type comparison * * This function should not be used by new algorithm types. * Please use crypto_alloc_tfm instead. * * crypto_alloc_base() will first attempt to locate an already loaded * algorithm. If that fails and the kernel supports dynamically loadable * modules, it will then attempt to load a module of the same name or * alias. If that fails it will send a query to any loaded crypto manager * to construct an algorithm on the fly. A refcount is grabbed on the * algorithm which is then associated with the new transform. * * The returned transform is of a non-determinate type. Most people * should use one of the more specific allocation functions such as * crypto_alloc_skcipher(). * * In case of error the return value is an error pointer. */ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask) { struct crypto_tfm *tfm; int err; for (;;) { struct crypto_alg *alg; alg = crypto_alg_mod_lookup(alg_name, type, mask); if (IS_ERR(alg)) { err = PTR_ERR(alg); goto err; } tfm = __crypto_alloc_tfm(alg, type, mask); if (!IS_ERR(tfm)) return tfm; crypto_mod_put(alg); err = PTR_ERR(tfm); err: if (err != -EAGAIN) break; if (fatal_signal_pending(current)) { err = -EINTR; break; } } return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_base); static void *crypto_alloc_tfmmem(struct crypto_alg *alg, const struct crypto_type *frontend, int node, gfp_t gfp) { struct crypto_tfm *tfm; unsigned int tfmsize; unsigned int total; char *mem; tfmsize = frontend->tfmsize; total = tfmsize + sizeof(*tfm) + frontend->extsize(alg); mem = kzalloc_node(total, gfp, node); if (mem == NULL) return ERR_PTR(-ENOMEM); tfm = (struct crypto_tfm *)(mem + tfmsize); tfm->__crt_alg = alg; tfm->node = node; refcount_set(&tfm->refcnt, 1); return mem; } void *crypto_create_tfm_node(struct crypto_alg *alg, const struct crypto_type *frontend, int node) { struct crypto_tfm *tfm; char *mem; int err; mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_KERNEL); if (IS_ERR(mem)) goto out; tfm = (struct crypto_tfm *)(mem + frontend->tfmsize); err = frontend->init_tfm(tfm); if (err) goto out_free_tfm; if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm))) goto cra_init_failed; goto out; cra_init_failed: crypto_exit_ops(tfm); out_free_tfm: if (err == -EAGAIN) crypto_shoot_alg(alg); kfree(mem); mem = ERR_PTR(err); out: return mem; } EXPORT_SYMBOL_GPL(crypto_create_tfm_node); void *crypto_clone_tfm(const struct crypto_type *frontend, struct crypto_tfm *otfm) { struct crypto_alg *alg = otfm->__crt_alg; struct crypto_tfm *tfm; char *mem; mem = ERR_PTR(-ESTALE); if (unlikely(!crypto_mod_get(alg))) goto out; mem = crypto_alloc_tfmmem(alg, frontend, otfm->node, GFP_ATOMIC); if (IS_ERR(mem)) { crypto_mod_put(alg); goto out; } tfm = (struct crypto_tfm *)(mem + frontend->tfmsize); tfm->crt_flags = otfm->crt_flags; tfm->exit = otfm->exit; out: return mem; } EXPORT_SYMBOL_GPL(crypto_clone_tfm); struct crypto_alg *crypto_find_alg(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask) { if (frontend) { type &= frontend->maskclear; mask &= frontend->maskclear; type |= frontend->type; mask |= frontend->maskset; } return crypto_alg_mod_lookup(alg_name, type, mask); } EXPORT_SYMBOL_GPL(crypto_find_alg); /* * crypto_alloc_tfm_node - Locate algorithm and allocate transform * @alg_name: Name of algorithm * @frontend: Frontend algorithm type * @type: Type of algorithm * @mask: Mask for type comparison * @node: NUMA node in which users desire to put requests, if node is * NUMA_NO_NODE, it means users have no special requirement. * * crypto_alloc_tfm() will first attempt to locate an already loaded * algorithm. If that fails and the kernel supports dynamically loadable * modules, it will then attempt to load a module of the same name or * alias. If that fails it will send a query to any loaded crypto manager * to construct an algorithm on the fly. A refcount is grabbed on the * algorithm which is then associated with the new transform. * * The returned transform is of a non-determinate type. Most people * should use one of the more specific allocation functions such as * crypto_alloc_skcipher(). * * In case of error the return value is an error pointer. */ void *crypto_alloc_tfm_node(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask, int node) { void *tfm; int err; for (;;) { struct crypto_alg *alg; alg = crypto_find_alg(alg_name, frontend, type, mask); if (IS_ERR(alg)) { err = PTR_ERR(alg); goto err; } tfm = crypto_create_tfm_node(alg, frontend, node); if (!IS_ERR(tfm)) return tfm; crypto_mod_put(alg); err = PTR_ERR(tfm); err: if (err != -EAGAIN) break; if (fatal_signal_pending(current)) { err = -EINTR; break; } } return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_tfm_node); /* * crypto_destroy_tfm - Free crypto transform * @mem: Start of tfm slab * @tfm: Transform to free * * This function frees up the transform and any associated resources, * then drops the refcount on the associated algorithm. */ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm) { struct crypto_alg *alg; if (IS_ERR_OR_NULL(mem)) return; if (!refcount_dec_and_test(&tfm->refcnt)) return; alg = tfm->__crt_alg; if (!tfm->exit && alg->cra_exit) alg->cra_exit(tfm); crypto_exit_ops(tfm); crypto_mod_put(alg); kfree_sensitive(mem); } EXPORT_SYMBOL_GPL(crypto_destroy_tfm); int crypto_has_alg(const char *name, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_has_alg); void crypto_req_done(void *data, int err) { struct crypto_wait *wait = data; if (err == -EINPROGRESS) return; wait->err = err; complete(&wait->completion); } EXPORT_SYMBOL_GPL(crypto_req_done); MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL");
79 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/stats.c * * procfs-based user access to generic RPC statistics. The stats files * reside in /proc/net/rpc. * * The read routines assume that the buffer passed in is just big enough. * If you implement an RPC service that has its own stats routine which * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE * limit. * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/metrics.h> #include <linux/rcupdate.h> #include <trace/events/sunrpc.h> #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC /* * Get RPC client stats */ static int rpc_proc_show(struct seq_file *seq, void *v) { const struct rpc_stat *statp = seq->private; const struct rpc_program *prog = statp->program; unsigned int i, j; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u\n", statp->rpccnt, statp->rpcretrans, statp->rpcauthrefresh); for (i = 0; i < prog->nrvers; i++) { const struct rpc_version *vers = prog->version[i]; if (!vers) continue; seq_printf(seq, "proc%u %u", vers->number, vers->nrprocs); for (j = 0; j < vers->nrprocs; j++) seq_printf(seq, " %u", vers->counts[j]); seq_putc(seq, '\n'); } return 0; } static int rpc_proc_open(struct inode *inode, struct file *file) { return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { .proc_open = rpc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, }; /* * Get RPC server stats */ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { const struct svc_program *prog = statp->program; const struct svc_version *vers; unsigned int i, j, k; unsigned long count; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u %u %u\n", statp->rpccnt, statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, statp->rpcbadfmt, statp->rpcbadauth, statp->rpcbadclnt); for (i = 0; i < prog->pg_nvers; i++) { vers = prog->pg_vers[i]; if (!vers) continue; seq_printf(seq, "proc%d %u", i, vers->vs_nproc); for (j = 0; j < vers->vs_nproc; j++) { count = 0; for_each_possible_cpu(k) count += per_cpu(vers->vs_count[j], k); seq_printf(seq, " %lu", count); } seq_putc(seq, '\n'); } } EXPORT_SYMBOL_GPL(svc_seq_show); /** * rpc_alloc_iostats - allocate an rpc_iostats structure * @clnt: RPC program, version, and xprt * */ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { struct rpc_iostats *stats; int i; stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL); if (stats) { for (i = 0; i < clnt->cl_maxproc; i++) spin_lock_init(&stats[i].om_lock); } return stats; } EXPORT_SYMBOL_GPL(rpc_alloc_iostats); /** * rpc_free_iostats - release an rpc_iostats structure * @stats: doomed rpc_iostats structure * */ void rpc_free_iostats(struct rpc_iostats *stats) { kfree(stats); } EXPORT_SYMBOL_GPL(rpc_free_iostats); /** * rpc_count_iostats_metrics - tally up per-task stats * @task: completed rpc_task * @op_metrics: stat structure for OP that will accumulate stats from @task */ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; ktime_t backlog, execute, now; if (!op_metrics || !req) return; now = ktime_get(); spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; /* kernel API: om_ops must never become larger than om_ntrans */ op_metrics->om_ntrans += max(req->rq_ntrans, 1); op_metrics->om_timeouts += task->tk_timeouts; op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; backlog = 0; if (ktime_to_ns(req->rq_xtime)) { backlog = ktime_sub(req->rq_xtime, task->tk_start); op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); execute = ktime_sub(now, task->tk_start); op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); if (task->tk_status < 0) op_metrics->om_error_status++; spin_unlock(&op_metrics->om_lock); trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); /** * rpc_count_iostats - tally up per-task stats * @task: completed rpc_task * @stats: array of stat structures * * Uses the statidx from @task */ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) { rpc_count_iostats_metrics(task, &stats[task->tk_msg.rpc_proc->p_statidx]); } EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, const struct rpc_procinfo *procs) { if (procs[op].p_name) seq_printf(seq, "\t%12s: ", procs[op].p_name); else if (op == 0) seq_printf(seq, "\t NULL: "); else seq_printf(seq, "\t%12u: ", op); } static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b) { a->om_ops += b->om_ops; a->om_ntrans += b->om_ntrans; a->om_timeouts += b->om_timeouts; a->om_bytes_sent += b->om_bytes_sent; a->om_bytes_recv += b->om_bytes_recv; a->om_queue = ktime_add(a->om_queue, b->om_queue); a->om_rtt = ktime_add(a->om_rtt, b->om_rtt); a->om_execute = ktime_add(a->om_execute, b->om_execute); a->om_error_status += b->om_error_status; } static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats, int op, const struct rpc_procinfo *procs) { _print_name(seq, op, procs); seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n", stats->om_ops, stats->om_ntrans, stats->om_timeouts, stats->om_bytes_sent, stats->om_bytes_recv, ktime_to_ms(stats->om_queue), ktime_to_ms(stats->om_rtt), ktime_to_ms(stats->om_execute), stats->om_error_status); } static int do_print_stats(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *seqv) { struct seq_file *seq = seqv; xprt->ops->print_stats(xprt, seq); return 0; } void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt) { unsigned int op, maxproc = clnt->cl_maxproc; if (!clnt->cl_metrics) return; seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); seq_printf(seq, "p/v: %u/%u (%s)\n", clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name); rpc_clnt_iterate_for_each_xprt(clnt, do_print_stats, seq); seq_printf(seq, "\tper-op statistics\n"); for (op = 0; op < maxproc; op++) { struct rpc_iostats stats = {}; struct rpc_clnt *next = clnt; do { _add_rpc_iostats(&stats, &next->cl_metrics[op]); if (next == next->cl_parent) break; next = next->cl_parent; } while (next); _print_rpc_iostats(seq, &stats, op, clnt->cl_procinfo); } } EXPORT_SYMBOL_GPL(rpc_clnt_show_stats); /* * Register/unregister RPC proc files */ static inline struct proc_dir_entry * do_register(struct net *net, const char *name, void *data, const struct proc_ops *proc_ops) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc/%s\n", name); sn = net_generic(net, sunrpc_net_id); return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data); } struct proc_dir_entry * rpc_proc_register(struct net *net, struct rpc_stat *statp) { return do_register(net, statp->program->name, statp, &rpc_proc_ops); } EXPORT_SYMBOL_GPL(rpc_proc_register); void rpc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { return do_register(net, statp->program->pg_name, statp, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); void svc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(svc_proc_unregister); int rpc_proc_init(struct net *net) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc\n"); sn = net_generic(net, sunrpc_net_id); sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); if (sn->proc_net_rpc == NULL) return -ENOMEM; return 0; } void rpc_proc_exit(struct net *net) { dprintk("RPC: unregistering /proc/net/rpc\n"); remove_proc_entry("rpc", net->proc_net); }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 111 113 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Facebook. All rights reserved. */ #include <linux/sched.h> #include <linux/stacktrace.h> #include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" #include "delayed-ref.h" #include "ref-verify.h" #include "fs.h" #include "accessors.h" /* * Used to keep track the roots and number of refs each root has for a given * bytenr. This just tracks the number of direct references, no shared * references. */ struct root_entry { u64 root_objectid; u64 num_refs; struct rb_node node; }; /* * These are meant to represent what should exist in the extent tree, these can * be used to verify the extent tree is consistent as these should all match * what the extent tree says. */ struct ref_entry { u64 root_objectid; u64 parent; u64 owner; u64 offset; u64 num_refs; struct rb_node node; }; #define MAX_TRACE 16 /* * Whenever we add/remove a reference we record the action. The action maps * back to the delayed ref action. We hold the ref we are changing in the * action so we can account for the history properly, and we record the root we * were called with since it could be different from ref_root. We also store * stack traces because that's how I roll. */ struct ref_action { int action; u64 root; struct ref_entry ref; struct list_head list; unsigned long trace[MAX_TRACE]; unsigned int trace_len; }; /* * One of these for every block we reference, it holds the roots and references * to it as well as all of the ref actions that have occurred to it. We never * free it until we unmount the file system in order to make sure re-allocations * are happening properly. */ struct block_entry { u64 bytenr; u64 len; u64 num_refs; int metadata; int from_disk; struct rb_root roots; struct rb_root refs; struct rb_node node; struct list_head actions; }; static struct block_entry *insert_block_entry(struct rb_root *root, struct block_entry *be) { struct rb_node **p = &root->rb_node; struct rb_node *parent_node = NULL; struct block_entry *entry; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct block_entry, node); if (entry->bytenr > be->bytenr) p = &(*p)->rb_left; else if (entry->bytenr < be->bytenr) p = &(*p)->rb_right; else return entry; } rb_link_node(&be->node, parent_node, p); rb_insert_color(&be->node, root); return NULL; } static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) { struct rb_node *n; struct block_entry *entry = NULL; n = root->rb_node; while (n) { entry = rb_entry(n, struct block_entry, node); if (entry->bytenr < bytenr) n = n->rb_right; else if (entry->bytenr > bytenr) n = n->rb_left; else return entry; } return NULL; } static struct root_entry *insert_root_entry(struct rb_root *root, struct root_entry *re) { struct rb_node **p = &root->rb_node; struct rb_node *parent_node = NULL; struct root_entry *entry; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct root_entry, node); if (entry->root_objectid > re->root_objectid) p = &(*p)->rb_left; else if (entry->root_objectid < re->root_objectid) p = &(*p)->rb_right; else return entry; } rb_link_node(&re->node, parent_node, p); rb_insert_color(&re->node, root); return NULL; } static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) { if (ref1->root_objectid < ref2->root_objectid) return -1; if (ref1->root_objectid > ref2->root_objectid) return 1; if (ref1->parent < ref2->parent) return -1; if (ref1->parent > ref2->parent) return 1; if (ref1->owner < ref2->owner) return -1; if (ref1->owner > ref2->owner) return 1; if (ref1->offset < ref2->offset) return -1; if (ref1->offset > ref2->offset) return 1; return 0; } static struct ref_entry *insert_ref_entry(struct rb_root *root, struct ref_entry *ref) { struct rb_node **p = &root->rb_node; struct rb_node *parent_node = NULL; struct ref_entry *entry; int cmp; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct ref_entry, node); cmp = comp_refs(entry, ref); if (cmp > 0) p = &(*p)->rb_left; else if (cmp < 0) p = &(*p)->rb_right; else return entry; } rb_link_node(&ref->node, parent_node, p); rb_insert_color(&ref->node, root); return NULL; } static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) { struct rb_node *n; struct root_entry *entry = NULL; n = root->rb_node; while (n) { entry = rb_entry(n, struct root_entry, node); if (entry->root_objectid < objectid) n = n->rb_right; else if (entry->root_objectid > objectid) n = n->rb_left; else return entry; } return NULL; } #ifdef CONFIG_STACKTRACE static void __save_stack_trace(struct ref_action *ra) { ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2); } static void __print_stack_trace(struct btrfs_fs_info *fs_info, struct ref_action *ra) { if (ra->trace_len == 0) { btrfs_err(fs_info, " ref-verify: no stacktrace"); return; } stack_trace_print(ra->trace, ra->trace_len, 2); } #else static inline void __save_stack_trace(struct ref_action *ra) { } static inline void __print_stack_trace(struct btrfs_fs_info *fs_info, struct ref_action *ra) { btrfs_err(fs_info, " ref-verify: no stacktrace support"); } #endif static void free_block_entry(struct block_entry *be) { struct root_entry *re; struct ref_entry *ref; struct ref_action *ra; struct rb_node *n; while ((n = rb_first(&be->roots))) { re = rb_entry(n, struct root_entry, node); rb_erase(&re->node, &be->roots); kfree(re); } while((n = rb_first(&be->refs))) { ref = rb_entry(n, struct ref_entry, node); rb_erase(&ref->node, &be->refs); kfree(ref); } while (!list_empty(&be->actions)) { ra = list_first_entry(&be->actions, struct ref_action, list); list_del(&ra->list); kfree(ra); } kfree(be); } static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, u64 bytenr, u64 len, u64 root_objectid) { struct block_entry *be = NULL, *exist; struct root_entry *re = NULL; re = kzalloc(sizeof(struct root_entry), GFP_NOFS); be = kzalloc(sizeof(struct block_entry), GFP_NOFS); if (!be || !re) { kfree(re); kfree(be); return ERR_PTR(-ENOMEM); } be->bytenr = bytenr; be->len = len; re->root_objectid = root_objectid; re->num_refs = 0; spin_lock(&fs_info->ref_verify_lock); exist = insert_block_entry(&fs_info->block_tree, be); if (exist) { if (root_objectid) { struct root_entry *exist_re; exist_re = insert_root_entry(&exist->roots, re); if (exist_re) kfree(re); } else { kfree(re); } kfree(be); return exist; } be->num_refs = 0; be->metadata = 0; be->from_disk = 0; be->roots = RB_ROOT; be->refs = RB_ROOT; INIT_LIST_HEAD(&be->actions); if (root_objectid) insert_root_entry(&be->roots, re); else kfree(re); return be; } static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, u64 parent, u64 bytenr, int level) { struct block_entry *be; struct root_entry *re; struct ref_entry *ref = NULL, *exist; ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; if (parent) ref->root_objectid = 0; else ref->root_objectid = ref_root; ref->parent = parent; ref->owner = level; ref->offset = 0; ref->num_refs = 1; be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root); if (IS_ERR(be)) { kfree(ref); return PTR_ERR(be); } be->num_refs++; be->from_disk = 1; be->metadata = 1; if (!parent) { ASSERT(ref_root); re = lookup_root_entry(&be->roots, ref_root); ASSERT(re); re->num_refs++; } exist = insert_ref_entry(&be->refs, ref); if (exist) { exist->num_refs++; kfree(ref); } spin_unlock(&fs_info->ref_verify_lock); return 0; } static int add_shared_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u32 num_refs, u64 bytenr, u64 num_bytes) { struct block_entry *be; struct ref_entry *ref; ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; be = add_block_entry(fs_info, bytenr, num_bytes, 0); if (IS_ERR(be)) { kfree(ref); return PTR_ERR(be); } be->num_refs += num_refs; ref->parent = parent; ref->num_refs = num_refs; if (insert_ref_entry(&be->refs, ref)) { spin_unlock(&fs_info->ref_verify_lock); btrfs_err(fs_info, "existing shared ref when reading from disk?"); kfree(ref); return -EINVAL; } spin_unlock(&fs_info->ref_verify_lock); return 0; } static int add_extent_data_ref(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_extent_data_ref *dref, u64 bytenr, u64 num_bytes) { struct block_entry *be; struct ref_entry *ref; struct root_entry *re; u64 ref_root = btrfs_extent_data_ref_root(leaf, dref); u64 owner = btrfs_extent_data_ref_objectid(leaf, dref); u64 offset = btrfs_extent_data_ref_offset(leaf, dref); u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { kfree(ref); return PTR_ERR(be); } be->num_refs += num_refs; ref->parent = 0; ref->owner = owner; ref->root_objectid = ref_root; ref->offset = offset; ref->num_refs = num_refs; if (insert_ref_entry(&be->refs, ref)) { spin_unlock(&fs_info->ref_verify_lock); btrfs_err(fs_info, "existing ref when reading from disk?"); kfree(ref); return -EINVAL; } re = lookup_root_entry(&be->roots, ref_root); if (!re) { spin_unlock(&fs_info->ref_verify_lock); btrfs_err(fs_info, "missing root in new block entry?"); return -EINVAL; } re->num_refs += num_refs; spin_unlock(&fs_info->ref_verify_lock); return 0; } static int process_extent_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key, int slot, int *tree_block_level) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct extent_buffer *leaf = path->nodes[0]; u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; int type, ret; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); if ((key->type == BTRFS_EXTENT_ITEM_KEY) && flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); *tree_block_level = btrfs_tree_block_level(leaf, info); iref = (struct btrfs_extent_inline_ref *)(info + 1); } else { if (key->type == BTRFS_METADATA_ITEM_KEY) *tree_block_level = key->offset; iref = (struct btrfs_extent_inline_ref *)(ei + 1); } ptr = (unsigned long)iref; end = (unsigned long)ei + item_size; while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_extent_inline_ref_type(leaf, iref); offset = btrfs_extent_inline_ref_offset(leaf, iref); switch (type) { case BTRFS_TREE_BLOCK_REF_KEY: ret = add_tree_block(fs_info, offset, 0, key->objectid, *tree_block_level); break; case BTRFS_SHARED_BLOCK_REF_KEY: ret = add_tree_block(fs_info, 0, offset, key->objectid, *tree_block_level); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); ret = add_extent_data_ref(fs_info, leaf, dref, key->objectid, key->offset); break; case BTRFS_SHARED_DATA_REF_KEY: sref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sref); ret = add_shared_data_ref(fs_info, offset, count, key->objectid, key->offset); break; case BTRFS_EXTENT_OWNER_REF_KEY: WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); break; default: btrfs_err(fs_info, "invalid key type in iref"); ret = -EINVAL; break; } if (ret) break; ptr += btrfs_extent_inline_ref_size(type); } return ret; } static int process_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 *bytenr, u64 *num_bytes, int *tree_block_level) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; int i = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); for (i = 0; i < nritems; i++) { btrfs_item_key_to_cpu(leaf, &key, i); switch (key.type) { case BTRFS_EXTENT_ITEM_KEY: *num_bytes = key.offset; fallthrough; case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, tree_block_level); break; case BTRFS_TREE_BLOCK_REF_KEY: ret = add_tree_block(fs_info, key.offset, 0, key.objectid, *tree_block_level); break; case BTRFS_SHARED_BLOCK_REF_KEY: ret = add_tree_block(fs_info, 0, key.offset, key.objectid, *tree_block_level); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = btrfs_item_ptr(leaf, i, struct btrfs_extent_data_ref); ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr, *num_bytes); break; case BTRFS_SHARED_DATA_REF_KEY: sref = btrfs_item_ptr(leaf, i, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sref); ret = add_shared_data_ref(fs_info, key.offset, count, *bytenr, *num_bytes); break; default: break; } if (ret) break; } return ret; } /* Walk down to the leaf from the given level */ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, int level, u64 *bytenr, u64 *num_bytes, int *tree_block_level) { struct extent_buffer *eb; int ret = 0; while (level >= 0) { if (level) { eb = btrfs_read_node_slot(path->nodes[level], path->slots[level]); if (IS_ERR(eb)) return PTR_ERR(eb); btrfs_tree_read_lock(eb); path->nodes[level-1] = eb; path->slots[level-1] = 0; path->locks[level-1] = BTRFS_READ_LOCK; } else { ret = process_leaf(root, path, bytenr, num_bytes, tree_block_level); if (ret) break; } level--; } return ret; } /* Walk up to the next node that needs to be processed */ static int walk_up_tree(struct btrfs_path *path, int *level) { int l; for (l = 0; l < BTRFS_MAX_LEVEL; l++) { if (!path->nodes[l]) continue; if (l) { path->slots[l]++; if (path->slots[l] < btrfs_header_nritems(path->nodes[l])) { *level = l; return 0; } } btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]); free_extent_buffer(path->nodes[l]); path->nodes[l] = NULL; path->slots[l] = 0; path->locks[l] = 0; } return 1; } static void dump_ref_action(struct btrfs_fs_info *fs_info, struct ref_action *ra) { btrfs_err(fs_info, " Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent, ra->ref.owner, ra->ref.offset, ra->ref.num_refs); __print_stack_trace(fs_info, ra); } /* * Dumps all the information from the block entry to printk, it's going to be * awesome. */ static void dump_block_entry(struct btrfs_fs_info *fs_info, struct block_entry *be) { struct ref_entry *ref; struct root_entry *re; struct ref_action *ra; struct rb_node *n; btrfs_err(fs_info, "dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d", be->bytenr, be->len, be->num_refs, be->metadata, be->from_disk); for (n = rb_first(&be->refs); n; n = rb_next(n)) { ref = rb_entry(n, struct ref_entry, node); btrfs_err(fs_info, " ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", ref->root_objectid, ref->parent, ref->owner, ref->offset, ref->num_refs); } for (n = rb_first(&be->roots); n; n = rb_next(n)) { re = rb_entry(n, struct root_entry, node); btrfs_err(fs_info, " root entry %llu, num_refs %llu", re->root_objectid, re->num_refs); } list_for_each_entry(ra, &be->actions, list) dump_ref_action(fs_info, ra); } /* * Called when we modify a ref for a bytenr. * * This will add an action item to the given bytenr and do sanity checks to make * sure we haven't messed something up. If we are making a new allocation and * this block entry has history we will delete all previous actions as long as * our sanity checks pass as they are no longer needed. */ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, struct btrfs_ref *generic_ref) { struct ref_entry *ref = NULL, *exist; struct ref_action *ra = NULL; struct block_entry *be = NULL; struct root_entry *re = NULL; int action = generic_ref->action; int ret = 0; bool metadata; u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; u64 ref_root = 0; u64 owner = 0; u64 offset = 0; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) ref_root = generic_ref->tree_ref.ref_root; owner = generic_ref->tree_ref.level; } else if (!parent) { ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; } metadata = owner < BTRFS_FIRST_FREE_OBJECTID; ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); if (!ra || !ref) { kfree(ref); kfree(ra); ret = -ENOMEM; goto out; } ref->parent = parent; ref->owner = owner; ref->root_objectid = ref_root; ref->offset = offset; ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; memcpy(&ra->ref, ref, sizeof(struct ref_entry)); /* * Save the extra info from the delayed ref in the ref action to make it * easier to figure out what is happening. The real ref's we add to the * ref tree need to reflect what we save on disk so it matches any * on-disk refs we pre-loaded. */ ra->ref.owner = owner; ra->ref.offset = offset; ra->ref.root_objectid = ref_root; __save_stack_trace(ra); INIT_LIST_HEAD(&ra->list); ra->action = action; ra->root = generic_ref->real_root; /* * This is an allocation, preallocate the block_entry in case we haven't * used it before. */ ret = -EINVAL; if (action == BTRFS_ADD_DELAYED_EXTENT) { /* * For subvol_create we'll just pass in whatever the parent root * is and the new root objectid, so let's not treat the passed * in root as if it really has a ref for this bytenr. */ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { kfree(ref); kfree(ra); ret = PTR_ERR(be); goto out; } be->num_refs++; if (metadata) be->metadata = 1; if (be->num_refs != 1) { btrfs_err(fs_info, "re-allocated a block that still has references to it!"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); goto out_unlock; } while (!list_empty(&be->actions)) { struct ref_action *tmp; tmp = list_first_entry(&be->actions, struct ref_action, list); list_del(&tmp->list); kfree(tmp); } } else { struct root_entry *tmp; if (!parent) { re = kmalloc(sizeof(struct root_entry), GFP_NOFS); if (!re) { kfree(ref); kfree(ra); ret = -ENOMEM; goto out; } /* * This is the root that is modifying us, so it's the * one we want to lookup below when we modify the * re->num_refs. */ ref_root = generic_ref->real_root; re->root_objectid = generic_ref->real_root; re->num_refs = 0; } spin_lock(&fs_info->ref_verify_lock); be = lookup_block_entry(&fs_info->block_tree, bytenr); if (!be) { btrfs_err(fs_info, "trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", action, bytenr, num_bytes); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); kfree(re); goto out_unlock; } else if (be->num_refs == 0) { btrfs_err(fs_info, "trying to do action %d for a bytenr that has 0 total references", action); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); kfree(re); goto out_unlock; } if (!parent) { tmp = insert_root_entry(&be->roots, re); if (tmp) { kfree(re); re = tmp; } } } exist = insert_ref_entry(&be->refs, ref); if (exist) { if (action == BTRFS_DROP_DELAYED_REF) { if (exist->num_refs == 0) { btrfs_err(fs_info, "dropping a ref for a existing root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); goto out_unlock; } exist->num_refs--; if (exist->num_refs == 0) { rb_erase(&exist->node, &be->refs); kfree(exist); } } else if (!be->metadata) { exist->num_refs++; } else { btrfs_err(fs_info, "attempting to add another ref for an existing ref on a tree block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); goto out_unlock; } kfree(ref); } else { if (action == BTRFS_DROP_DELAYED_REF) { btrfs_err(fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); goto out_unlock; } } if (!parent && !re) { re = lookup_root_entry(&be->roots, ref_root); if (!re) { /* * This shouldn't happen because we will add our re * above when we lookup the be with !parent, but just in * case catch this case so we don't panic because I * didn't think of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", generic_ref->real_root, be->bytenr); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ra); goto out_unlock; } } if (action == BTRFS_DROP_DELAYED_REF) { if (re) re->num_refs--; be->num_refs--; } else if (action == BTRFS_ADD_DELAYED_REF) { be->num_refs++; if (re) re->num_refs++; } list_add_tail(&ra->list, &be->actions); ret = 0; out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: if (ret) { btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } return ret; } /* Free up the ref cache */ void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) { struct block_entry *be; struct rb_node *n; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return; spin_lock(&fs_info->ref_verify_lock); while ((n = rb_first(&fs_info->block_tree))) { be = rb_entry(n, struct block_entry, node); rb_erase(&be->node, &fs_info->block_tree); free_block_entry(be); cond_resched_lock(&fs_info->ref_verify_lock); } spin_unlock(&fs_info->ref_verify_lock); } void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct block_entry *be = NULL, *entry; struct rb_node *n; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return; spin_lock(&fs_info->ref_verify_lock); n = fs_info->block_tree.rb_node; while (n) { entry = rb_entry(n, struct block_entry, node); if (entry->bytenr < start) { n = n->rb_right; } else if (entry->bytenr > start) { n = n->rb_left; } else { be = entry; break; } /* We want to get as close to start as possible */ if (be == NULL || (entry->bytenr < start && be->bytenr > start) || (entry->bytenr < start && entry->bytenr > be->bytenr)) be = entry; } /* * Could have an empty block group, maybe have something to check for * this case to verify we were actually empty? */ if (!be) { spin_unlock(&fs_info->ref_verify_lock); return; } n = &be->node; while (n) { be = rb_entry(n, struct block_entry, node); n = rb_next(n); if (be->bytenr < start && be->bytenr + be->len > start) { btrfs_err(fs_info, "block entry overlaps a block group [%llu,%llu]!", start, len); dump_block_entry(fs_info, be); continue; } if (be->bytenr < start) continue; if (be->bytenr >= start + len) break; if (be->bytenr + be->len > start + len) { btrfs_err(fs_info, "block entry overlaps a block group [%llu,%llu]!", start, len); dump_block_entry(fs_info, be); } rb_erase(&be->node, &fs_info->block_tree); free_block_entry(be); } spin_unlock(&fs_info->ref_verify_lock); } /* Walk down all roots and build the ref tree, meant to be called at mount */ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *eb; int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; int ret, level; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; extent_root = btrfs_extent_root(fs_info, 0); eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; path->locks[level] = BTRFS_READ_LOCK; while (1) { /* * We have to keep track of the bytenr/num_bytes we last hit * because we could have run out of space for an inline ref, and * would have had to added a ref key item which may appear on a * different leaf from the original extent item. */ ret = walk_down_tree(extent_root, path, level, &bytenr, &num_bytes, &tree_block_level); if (ret) break; ret = walk_up_tree(path, &level); if (ret < 0) break; if (ret > 0) { ret = 0; break; } } if (ret) { btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; }
16 9 7 1 13 1 12 4 4 2 3 1 9 9 8 8 8 2 8 8 7 16 1 6 6 5 6 6 2 2 6 1 6 6 5 6 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 // SPDX-License-Identifier: GPL-2.0+ /* net/sched/act_ctinfo.c netfilter ctinfo connmark actions * * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/pkt_cls.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_ctinfo.h> #include <net/tc_act/tc_ctinfo.h> #include <net/tc_wrapper.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> static struct tc_action_ops act_ctinfo_ops; static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb, int wlen, int proto) { u8 dscp, newdscp; newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) & ~INET_ECN_MASK; switch (proto) { case NFPROTO_IPV4: dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; if (dscp != newdscp) { if (likely(!skb_try_make_writable(skb, wlen))) { ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp); ca->stats_dscp_set++; } else { ca->stats_dscp_error++; } } break; case NFPROTO_IPV6: dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; if (dscp != newdscp) { if (likely(!skb_try_make_writable(skb, wlen))) { ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp); ca->stats_dscp_set++; } else { ca->stats_dscp_error++; } } break; default: break; } } static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb) { ca->stats_cpmark_set++; skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash = NULL; struct tcf_ctinfo *ca = to_ctinfo(a); struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; enum ip_conntrack_info ctinfo; struct tcf_ctinfo_params *cp; struct nf_conn *ct; int proto, wlen; int action; cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV4; break; case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV6; break; default: goto out; } ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* look harder, usually ingress */ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, cp->net, &tuple)) goto out; zone.id = cp->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; thash = nf_conntrack_find_get(cp->net, &zone, &tuple); if (!thash) goto out; ct = nf_ct_tuplehash_to_ctrack(thash); } if (cp->mode & CTINFO_MODE_DSCP) if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask)) tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); if (cp->mode & CTINFO_MODE_CPMARK) tcf_ctinfo_cpmark_set(ct, ca, cp, skb); if (thash) nf_ct_put(ct); out: return action; } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { [TCA_CTINFO_ACT] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)), [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 }, }; static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; u32 dscpmask = 0, dscpstatemask, index; struct nlattr *tb[TCA_CTINFO_MAX + 1]; struct tcf_ctinfo_params *cp_new; struct tcf_chain *goto_ch = NULL; struct tc_ctinfo *actparm; struct tcf_ctinfo *ci; u8 dscpmaskshift; int ret = 0, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack); if (err < 0) return err; if (!tb[TCA_CTINFO_ACT]) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_CTINFO_ACT attribute"); return -EINVAL; } actparm = nla_data(tb[TCA_CTINFO_ACT]); /* do some basic validation here before dynamically allocating things */ /* that we would otherwise have to clean up. */ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) { dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]); /* need contiguous 6 bit mask */ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0; if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CTINFO_PARMS_DSCP_MASK], "dscp mask must be 6 contiguous bits"); return -EINVAL; } dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ? nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0; /* mask & statemask must not overlap */ if (dscpmask & dscpstatemask) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], "dscp statemask must not overlap dscp mask"); return -EINVAL; } } /* done the validation:now to the actual action allocation */ index = actparm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_ctinfo_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) /* don't override defaults */ return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } else { return err; } err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; ci = to_ctinfo(*a); cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); if (unlikely(!cp_new)) { err = -ENOMEM; goto put_chain; } cp_new->net = net; cp_new->zone = tb[TCA_CTINFO_ZONE] ? nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0; if (dscpmask) { cp_new->dscpmask = dscpmask; cp_new->dscpmaskshift = dscpmaskshift; cp_new->dscpstatemask = dscpstatemask; cp_new->mode |= CTINFO_MODE_DSCP; } if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) { cp_new->cpmarkmask = nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]); cp_new->mode |= CTINFO_MODE_CPMARK; } spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, lockdep_is_held(&ci->tcf_lock)); spin_unlock_bh(&ci->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (cp_new) kfree_rcu(cp_new, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { struct tcf_ctinfo *ci = to_ctinfo(a); struct tc_ctinfo opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; unsigned char *b = skb_tail_pointer(skb); struct tcf_ctinfo_params *cp; struct tcf_t t; spin_lock_bh(&ci->tcf_lock); cp = rcu_dereference_protected(ci->params, lockdep_is_held(&ci->tcf_lock)); tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) goto nla_put_failure; opt.action = ci->tcf_action; if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone)) goto nla_put_failure; if (cp->mode & CTINFO_MODE_DSCP) { if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK, cp->dscpmask)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK, cp->dscpstatemask)) goto nla_put_failure; } if (cp->mode & CTINFO_MODE_CPMARK) { if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK, cp->cpmarkmask)) goto nla_put_failure; } if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, ci->stats_dscp_set, TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, ci->stats_dscp_error, TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, ci->stats_cpmark_set, TCA_CTINFO_PAD)) goto nla_put_failure; spin_unlock_bh(&ci->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&ci->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_ctinfo_cleanup(struct tc_action *a) { struct tcf_ctinfo *ci = to_ctinfo(a); struct tcf_ctinfo_params *cp; cp = rcu_dereference_protected(ci->params, 1); if (cp) kfree_rcu(cp, rcu); } static struct tc_action_ops act_ctinfo_ops = { .kind = "ctinfo", .id = TCA_ID_CTINFO, .owner = THIS_MODULE, .act = tcf_ctinfo_act, .dump = tcf_ctinfo_dump, .init = tcf_ctinfo_init, .cleanup= tcf_ctinfo_cleanup, .size = sizeof(struct tcf_ctinfo), }; static __net_init int ctinfo_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); return tc_action_net_init(net, tn, &act_ctinfo_ops); } static void __net_exit ctinfo_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_ctinfo_ops.net_id); } static struct pernet_operations ctinfo_net_ops = { .init = ctinfo_init_net, .exit_batch = ctinfo_exit_net, .id = &act_ctinfo_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init ctinfo_init_module(void) { return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops); } static void __exit ctinfo_cleanup_module(void) { tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops); } module_init(ctinfo_init_module); module_exit(ctinfo_cleanup_module); MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>"); MODULE_DESCRIPTION("Connection tracking mark actions"); MODULE_LICENSE("GPL");
21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/mac.c * * Code extracted from drivers/block/genhd.c * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include <linux/ctype.h> #include "check.h" #include "mac.h" #ifdef CONFIG_PPC_PMAC #include <asm/machdep.h> extern void note_bootable_part(dev_t dev, int part, int goodness); #endif /* * Code to understand MacOS partition tables. */ static inline void mac_fix_string(char *stg, int len) { int i; for (i = len - 1; i >= 0 && stg[i] == ' '; i--) stg[i] = 0; } int mac_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; int slot, blocks_in_map; unsigned secsize, datasize, partoffset; #ifdef CONFIG_PPC_PMAC int found_root = 0; int found_root_goodness = 0; #endif struct mac_partition *part; struct mac_driver_desc *md; /* Get 0th block and look at the first partition map entry. */ md = read_part_sector(state, 0, &sect); if (!md) return -1; if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { put_dev_sector(sect); return 0; } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, &sect); if (!data) return -1; partoffset = secsize % 512; if (partoffset + sizeof(*part) > datasize) return -1; part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); return 0; /* not a MacOS disk */ } blocks_in_map = be32_to_cpu(part->map_count); if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { put_dev_sector(sect); return 0; } if (blocks_in_map >= state->limit) blocks_in_map = state->limit - 1; strlcat(state->pp_buf, " [mac]", PAGE_SIZE); for (slot = 1; slot <= blocks_in_map; ++slot) { int pos = slot * secsize; put_dev_sector(sect); data = read_part_sector(state, pos/512, &sect); if (!data) return -1; part = (struct mac_partition *) (data + pos%512); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) break; put_partition(state, slot, be32_to_cpu(part->start_block) * (secsize/512), be32_to_cpu(part->block_count) * (secsize/512)); if (!strncasecmp(part->type, "Linux_RAID", 10)) state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* * If this is the first bootable partition, tell the * setup code, in case it wants to make this the root. */ if (machine_is(powermac)) { int goodness = 0; mac_fix_string(part->processor, 16); mac_fix_string(part->name, 32); mac_fix_string(part->type, 32); if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) && strcasecmp(part->processor, "powerpc") == 0) goodness++; if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 || (strncasecmp(part->type, "Linux", 5) == 0 && strcasecmp(part->type, "Linux_swap") != 0)) { int i, l; goodness++; l = strlen(part->name); if (strcmp(part->name, "/") == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", 4) == 0) { goodness += 2; break; } } if (strncasecmp(part->name, "swap", 4) == 0) goodness--; } if (goodness > found_root_goodness) { found_root = slot; found_root_goodness = goodness; } } #endif /* CONFIG_PPC_PMAC */ } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
8 8 3 3 1 1 1 3 3 3 8 18 14 7 7 7 7 7 8 8 8 8 2 3 30 30 30 9 5 5 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtual Raw MIDI client on Sequencer * * Copyright (c) 2000 by Takashi Iwai <tiwai@suse.de>, * Jaroslav Kysela <perex@perex.cz> */ /* * Virtual Raw MIDI client * * The virtual rawmidi client is a sequencer client which associate * a rawmidi device file. The created rawmidi device file can be * accessed as a normal raw midi, but its MIDI source and destination * are arbitrary. For example, a user-client software synth connected * to this port can be used as a normal midi device as well. * * The virtual rawmidi device accepts also multiple opens. Each file * has its own input buffer, so that no conflict would occur. The drain * of input/output buffer acts only to the local buffer. * */ #include <linux/init.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/slab.h> #include <sound/core.h> #include <sound/rawmidi.h> #include <sound/info.h> #include <sound/control.h> #include <sound/minors.h> #include <sound/seq_kernel.h> #include <sound/seq_midi_event.h> #include <sound/seq_virmidi.h> MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>"); MODULE_DESCRIPTION("Virtual Raw MIDI client on Sequencer"); MODULE_LICENSE("GPL"); /* * initialize an event record */ static void snd_virmidi_init_event(struct snd_virmidi *vmidi, struct snd_seq_event *ev) { memset(ev, 0, sizeof(*ev)); ev->source.port = vmidi->port; switch (vmidi->seq_mode) { case SNDRV_VIRMIDI_SEQ_DISPATCH: ev->dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; break; case SNDRV_VIRMIDI_SEQ_ATTACH: /* FIXME: source and destination are same - not good.. */ ev->dest.client = vmidi->client; ev->dest.port = vmidi->port; break; } ev->type = SNDRV_SEQ_EVENT_NONE; } /* * decode input event and put to read buffer of each opened file */ static int snd_virmidi_dev_receive_event(struct snd_virmidi_dev *rdev, struct snd_seq_event *ev, bool atomic) { struct snd_virmidi *vmidi; unsigned char msg[4]; int len; if (atomic) read_lock(&rdev->filelist_lock); else down_read(&rdev->filelist_sem); list_for_each_entry(vmidi, &rdev->filelist, list) { if (!READ_ONCE(vmidi->trigger)) continue; if (ev->type == SNDRV_SEQ_EVENT_SYSEX) { if ((ev->flags & SNDRV_SEQ_EVENT_LENGTH_MASK) != SNDRV_SEQ_EVENT_LENGTH_VARIABLE) continue; snd_seq_dump_var_event(ev, (snd_seq_dump_func_t)snd_rawmidi_receive, vmidi->substream); snd_midi_event_reset_decode(vmidi->parser); } else { len = snd_midi_event_decode(vmidi->parser, msg, sizeof(msg), ev); if (len > 0) snd_rawmidi_receive(vmidi->substream, msg, len); } } if (atomic) read_unlock(&rdev->filelist_lock); else up_read(&rdev->filelist_sem); return 0; } /* * event handler of virmidi port */ static int snd_virmidi_event_input(struct snd_seq_event *ev, int direct, void *private_data, int atomic, int hop) { struct snd_virmidi_dev *rdev; rdev = private_data; if (!(rdev->flags & SNDRV_VIRMIDI_USE)) return 0; /* ignored */ return snd_virmidi_dev_receive_event(rdev, ev, atomic); } /* * trigger rawmidi stream for input */ static void snd_virmidi_input_trigger(struct snd_rawmidi_substream *substream, int up) { struct snd_virmidi *vmidi = substream->runtime->private_data; WRITE_ONCE(vmidi->trigger, !!up); } /* process rawmidi bytes and send events; * we need no lock here for vmidi->event since it's handled only in this work */ static void snd_vmidi_output_work(struct work_struct *work) { struct snd_virmidi *vmidi; struct snd_rawmidi_substream *substream; unsigned char input; int ret; vmidi = container_of(work, struct snd_virmidi, output_work); substream = vmidi->substream; /* discard the outputs in dispatch mode unless subscribed */ if (vmidi->seq_mode == SNDRV_VIRMIDI_SEQ_DISPATCH && !(vmidi->rdev->flags & SNDRV_VIRMIDI_SUBSCRIBE)) { snd_rawmidi_proceed(substream); return; } while (READ_ONCE(vmidi->trigger)) { if (snd_rawmidi_transmit(substream, &input, 1) != 1) break; if (!snd_midi_event_encode_byte(vmidi->parser, input, &vmidi->event)) continue; if (vmidi->event.type != SNDRV_SEQ_EVENT_NONE) { ret = snd_seq_kernel_client_dispatch(vmidi->client, &vmidi->event, false, 0); vmidi->event.type = SNDRV_SEQ_EVENT_NONE; if (ret < 0) break; } /* rawmidi input might be huge, allow to have a break */ cond_resched(); } } /* * trigger rawmidi stream for output */ static void snd_virmidi_output_trigger(struct snd_rawmidi_substream *substream, int up) { struct snd_virmidi *vmidi = substream->runtime->private_data; WRITE_ONCE(vmidi->trigger, !!up); if (up) queue_work(system_highpri_wq, &vmidi->output_work); } /* * open rawmidi handle for input */ static int snd_virmidi_input_open(struct snd_rawmidi_substream *substream) { struct snd_virmidi_dev *rdev = substream->rmidi->private_data; struct snd_rawmidi_runtime *runtime = substream->runtime; struct snd_virmidi *vmidi; vmidi = kzalloc(sizeof(*vmidi), GFP_KERNEL); if (vmidi == NULL) return -ENOMEM; vmidi->substream = substream; if (snd_midi_event_new(0, &vmidi->parser) < 0) { kfree(vmidi); return -ENOMEM; } vmidi->seq_mode = rdev->seq_mode; vmidi->client = rdev->client; vmidi->port = rdev->port; runtime->private_data = vmidi; down_write(&rdev->filelist_sem); write_lock_irq(&rdev->filelist_lock); list_add_tail(&vmidi->list, &rdev->filelist); write_unlock_irq(&rdev->filelist_lock); up_write(&rdev->filelist_sem); vmidi->rdev = rdev; return 0; } /* * open rawmidi handle for output */ static int snd_virmidi_output_open(struct snd_rawmidi_substream *substream) { struct snd_virmidi_dev *rdev = substream->rmidi->private_data; struct snd_rawmidi_runtime *runtime = substream->runtime; struct snd_virmidi *vmidi; vmidi = kzalloc(sizeof(*vmidi), GFP_KERNEL); if (vmidi == NULL) return -ENOMEM; vmidi->substream = substream; if (snd_midi_event_new(MAX_MIDI_EVENT_BUF, &vmidi->parser) < 0) { kfree(vmidi); return -ENOMEM; } vmidi->seq_mode = rdev->seq_mode; vmidi->client = rdev->client; vmidi->port = rdev->port; snd_virmidi_init_event(vmidi, &vmidi->event); vmidi->rdev = rdev; INIT_WORK(&vmidi->output_work, snd_vmidi_output_work); runtime->private_data = vmidi; return 0; } /* * close rawmidi handle for input */ static int snd_virmidi_input_close(struct snd_rawmidi_substream *substream) { struct snd_virmidi_dev *rdev = substream->rmidi->private_data; struct snd_virmidi *vmidi = substream->runtime->private_data; down_write(&rdev->filelist_sem); write_lock_irq(&rdev->filelist_lock); list_del(&vmidi->list); write_unlock_irq(&rdev->filelist_lock); up_write(&rdev->filelist_sem); snd_midi_event_free(vmidi->parser); substream->runtime->private_data = NULL; kfree(vmidi); return 0; } /* * close rawmidi handle for output */ static int snd_virmidi_output_close(struct snd_rawmidi_substream *substream) { struct snd_virmidi *vmidi = substream->runtime->private_data; WRITE_ONCE(vmidi->trigger, false); /* to be sure */ cancel_work_sync(&vmidi->output_work); snd_midi_event_free(vmidi->parser); substream->runtime->private_data = NULL; kfree(vmidi); return 0; } /* * drain output work queue */ static void snd_virmidi_output_drain(struct snd_rawmidi_substream *substream) { struct snd_virmidi *vmidi = substream->runtime->private_data; flush_work(&vmidi->output_work); } /* * subscribe callback - allow output to rawmidi device */ static int snd_virmidi_subscribe(void *private_data, struct snd_seq_port_subscribe *info) { struct snd_virmidi_dev *rdev; rdev = private_data; if (!try_module_get(rdev->card->module)) return -EFAULT; rdev->flags |= SNDRV_VIRMIDI_SUBSCRIBE; return 0; } /* * unsubscribe callback - disallow output to rawmidi device */ static int snd_virmidi_unsubscribe(void *private_data, struct snd_seq_port_subscribe *info) { struct snd_virmidi_dev *rdev; rdev = private_data; rdev->flags &= ~SNDRV_VIRMIDI_SUBSCRIBE; module_put(rdev->card->module); return 0; } /* * use callback - allow input to rawmidi device */ static int snd_virmidi_use(void *private_data, struct snd_seq_port_subscribe *info) { struct snd_virmidi_dev *rdev; rdev = private_data; if (!try_module_get(rdev->card->module)) return -EFAULT; rdev->flags |= SNDRV_VIRMIDI_USE; return 0; } /* * unuse callback - disallow input to rawmidi device */ static int snd_virmidi_unuse(void *private_data, struct snd_seq_port_subscribe *info) { struct snd_virmidi_dev *rdev; rdev = private_data; rdev->flags &= ~SNDRV_VIRMIDI_USE; module_put(rdev->card->module); return 0; } /* * Register functions */ static const struct snd_rawmidi_ops snd_virmidi_input_ops = { .open = snd_virmidi_input_open, .close = snd_virmidi_input_close, .trigger = snd_virmidi_input_trigger, }; static const struct snd_rawmidi_ops snd_virmidi_output_ops = { .open = snd_virmidi_output_open, .close = snd_virmidi_output_close, .trigger = snd_virmidi_output_trigger, .drain = snd_virmidi_output_drain, }; /* * create a sequencer client and a port */ static int snd_virmidi_dev_attach_seq(struct snd_virmidi_dev *rdev) { int client; struct snd_seq_port_callback pcallbacks; struct snd_seq_port_info *pinfo; int err; if (rdev->client >= 0) return 0; pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); if (!pinfo) { err = -ENOMEM; goto __error; } client = snd_seq_create_kernel_client(rdev->card, rdev->device, "%s %d-%d", rdev->rmidi->name, rdev->card->number, rdev->device); if (client < 0) { err = client; goto __error; } rdev->client = client; /* create a port */ pinfo->addr.client = client; sprintf(pinfo->name, "VirMIDI %d-%d", rdev->card->number, rdev->device); /* set all capabilities */ pinfo->capability |= SNDRV_SEQ_PORT_CAP_WRITE | SNDRV_SEQ_PORT_CAP_SYNC_WRITE | SNDRV_SEQ_PORT_CAP_SUBS_WRITE; pinfo->capability |= SNDRV_SEQ_PORT_CAP_READ | SNDRV_SEQ_PORT_CAP_SYNC_READ | SNDRV_SEQ_PORT_CAP_SUBS_READ; pinfo->capability |= SNDRV_SEQ_PORT_CAP_DUPLEX; pinfo->direction = SNDRV_SEQ_PORT_DIR_BIDIRECTION; pinfo->type = SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | SNDRV_SEQ_PORT_TYPE_SOFTWARE | SNDRV_SEQ_PORT_TYPE_PORT; pinfo->midi_channels = 16; memset(&pcallbacks, 0, sizeof(pcallbacks)); pcallbacks.owner = THIS_MODULE; pcallbacks.private_data = rdev; pcallbacks.subscribe = snd_virmidi_subscribe; pcallbacks.unsubscribe = snd_virmidi_unsubscribe; pcallbacks.use = snd_virmidi_use; pcallbacks.unuse = snd_virmidi_unuse; pcallbacks.event_input = snd_virmidi_event_input; pinfo->kernel = &pcallbacks; err = snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_CREATE_PORT, pinfo); if (err < 0) { snd_seq_delete_kernel_client(client); rdev->client = -1; goto __error; } rdev->port = pinfo->addr.port; err = 0; /* success */ __error: kfree(pinfo); return err; } /* * release the sequencer client */ static void snd_virmidi_dev_detach_seq(struct snd_virmidi_dev *rdev) { if (rdev->client >= 0) { snd_seq_delete_kernel_client(rdev->client); rdev->client = -1; } } /* * register the device */ static int snd_virmidi_dev_register(struct snd_rawmidi *rmidi) { struct snd_virmidi_dev *rdev = rmidi->private_data; int err; switch (rdev->seq_mode) { case SNDRV_VIRMIDI_SEQ_DISPATCH: err = snd_virmidi_dev_attach_seq(rdev); if (err < 0) return err; break; case SNDRV_VIRMIDI_SEQ_ATTACH: if (rdev->client == 0) return -EINVAL; /* should check presence of port more strictly.. */ break; default: pr_err("ALSA: seq_virmidi: seq_mode is not set: %d\n", rdev->seq_mode); return -EINVAL; } return 0; } /* * unregister the device */ static int snd_virmidi_dev_unregister(struct snd_rawmidi *rmidi) { struct snd_virmidi_dev *rdev = rmidi->private_data; if (rdev->seq_mode == SNDRV_VIRMIDI_SEQ_DISPATCH) snd_virmidi_dev_detach_seq(rdev); return 0; } /* * */ static const struct snd_rawmidi_global_ops snd_virmidi_global_ops = { .dev_register = snd_virmidi_dev_register, .dev_unregister = snd_virmidi_dev_unregister, }; /* * free device */ static void snd_virmidi_free(struct snd_rawmidi *rmidi) { struct snd_virmidi_dev *rdev = rmidi->private_data; kfree(rdev); } /* * create a new device * */ /* exported */ int snd_virmidi_new(struct snd_card *card, int device, struct snd_rawmidi **rrmidi) { struct snd_rawmidi *rmidi; struct snd_virmidi_dev *rdev; int err; *rrmidi = NULL; err = snd_rawmidi_new(card, "VirMidi", device, 16, /* may be configurable */ 16, /* may be configurable */ &rmidi); if (err < 0) return err; strcpy(rmidi->name, rmidi->id); rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (rdev == NULL) { snd_device_free(card, rmidi); return -ENOMEM; } rdev->card = card; rdev->rmidi = rmidi; rdev->device = device; rdev->client = -1; init_rwsem(&rdev->filelist_sem); rwlock_init(&rdev->filelist_lock); INIT_LIST_HEAD(&rdev->filelist); rdev->seq_mode = SNDRV_VIRMIDI_SEQ_DISPATCH; rmidi->private_data = rdev; rmidi->private_free = snd_virmidi_free; rmidi->ops = &snd_virmidi_global_ops; snd_rawmidi_set_ops(rmidi, SNDRV_RAWMIDI_STREAM_INPUT, &snd_virmidi_input_ops); snd_rawmidi_set_ops(rmidi, SNDRV_RAWMIDI_STREAM_OUTPUT, &snd_virmidi_output_ops); rmidi->info_flags = SNDRV_RAWMIDI_INFO_INPUT | SNDRV_RAWMIDI_INFO_OUTPUT | SNDRV_RAWMIDI_INFO_DUPLEX; *rrmidi = rmidi; return 0; } EXPORT_SYMBOL(snd_virmidi_new);
3 4 3 3 3 3 3 3 3 3 1251 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/skbuff.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/rtnetlink.h> #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" struct internal_dev { struct vport *vport; }; static struct vport_ops ovs_internal_vport_ops; static struct internal_dev *internal_dev_priv(struct net_device *netdev) { return netdev_priv(netdev); } /* Called with rcu_read_lock_bh. */ static netdev_tx_t internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { int len, err; /* store len value because skb can be freed inside ovs_vport_receive() */ len = skb->len; rcu_read_lock(); err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); if (likely(!err)) dev_sw_netstats_tx_add(netdev, 1, len); else netdev->stats.tx_errors++; return NETDEV_TX_OK; } static int internal_dev_open(struct net_device *netdev) { netif_start_queue(netdev); return 0; } static int internal_dev_stop(struct net_device *netdev) { netif_stop_queue(netdev); return 0; } static void internal_dev_getinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { strscpy(info->driver, "openvswitch", sizeof(info->driver)); } static const struct ethtool_ops internal_dev_ethtool_ops = { .get_drvinfo = internal_dev_getinfo, .get_link = ethtool_op_get_link, }; static void internal_dev_destructor(struct net_device *dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); ovs_vport_free(vport); } static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = dev_get_tstats64, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { .kind = "openvswitch", }; static void do_setup(struct net_device *netdev) { ether_setup(netdev); netdev->max_mtu = ETH_MAX_MTU; netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | IFF_NO_QUEUE; netdev->needs_free_netdev = true; netdev->priv_destructor = NULL; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; netdev->vlan_features = netdev->features; netdev->hw_enc_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; netdev->hw_features = netdev->features & ~NETIF_F_LLTX; eth_hw_addr_random(netdev); } static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; struct internal_dev *internal_dev; struct net_device *dev; int err; vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } dev = alloc_netdev(sizeof(struct internal_dev), parms->name, NET_NAME_USER, do_setup); vport->dev = dev; if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->dev->tstats) { err = -ENOMEM; goto error_free_netdev; } dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); dev->ifindex = parms->desired_ifindex; internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); err = register_netdevice(vport->dev); if (err) goto error_unlock; vport->dev->priv_destructor = internal_dev_destructor; dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); netif_start_queue(vport->dev); return vport; error_unlock: rtnl_unlock(); free_percpu(dev->tstats); error_free_netdev: free_netdev(dev); error_free_vport: ovs_vport_free(vport); error: return ERR_PTR(err); } static void internal_dev_destroy(struct vport *vport) { netif_stop_queue(vport->dev); rtnl_lock(); dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ unregister_netdevice(vport->dev); free_percpu(vport->dev->tstats); rtnl_unlock(); } static int internal_dev_recv(struct sk_buff *skb) { struct net_device *netdev = skb->dev; if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); netdev->stats.rx_dropped++; return NETDEV_TX_OK; } skb_dst_drop(skb); nf_reset_ct(skb); secpath_reset(skb); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); dev_sw_netstats_rx_add(netdev, skb->len); netif_rx(skb); return NETDEV_TX_OK; } static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, .send = internal_dev_recv, }; int ovs_is_internal_dev(const struct net_device *netdev) { return netdev->netdev_ops == &internal_dev_netdev_ops; } struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) { if (!ovs_is_internal_dev(netdev)) return NULL; return internal_dev_priv(netdev)->vport; } int ovs_internal_dev_rtnl_link_register(void) { int err; err = rtnl_link_register(&internal_dev_link_ops); if (err < 0) return err; err = ovs_vport_ops_register(&ovs_internal_vport_ops); if (err < 0) rtnl_link_unregister(&internal_dev_link_ops); return err; } void ovs_internal_dev_rtnl_link_unregister(void) { ovs_vport_ops_unregister(&ovs_internal_vport_ops); rtnl_link_unregister(&internal_dev_link_ops); }
182 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 // SPDX-License-Identifier: GPL-2.0 /* * All the USB notify logic * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * notifier functions originally based on those in kernel/sys.c * but fixed up to not be so broken. * * Released under the GPLv2 only. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/usb.h> #include <linux/mutex.h> #include "usb.h" static BLOCKING_NOTIFIER_HEAD(usb_notifier_list); /** * usb_register_notify - register a notifier callback whenever a usb change happens * @nb: pointer to the notifier block for the callback events. * * These changes are either USB devices or busses being added or removed. */ void usb_register_notify(struct notifier_block *nb) { blocking_notifier_chain_register(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_register_notify); /** * usb_unregister_notify - unregister a notifier callback * @nb: pointer to the notifier block for the callback events. * * usb_register_notify() must have been previously called for this function * to work properly. */ void usb_unregister_notify(struct notifier_block *nb) { blocking_notifier_chain_unregister(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_unregister_notify); void usb_notify_add_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev); } void usb_notify_remove_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_REMOVE, udev); } void usb_notify_add_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus); } void usb_notify_remove_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus); }
13 7 8 8 30 33 13 2 2 38 45 46 16 43 141 2 2 2 2 19 3 3 2 2 3 2 27 24 24 24 16 18 18 6 6 34 27 7 7 19 13 1 2 2 2 27 1 2 27 13 13 13 2 27 2 2 2 27 27 27 27 27 4 4 4 1 2 2 2 2 2 2 2 2 2 2 2 2 6 2 2 2 2 2 2 2 2 2 5 6 6 5 2 1 1 1 1 31 31 18 12 1 31 31 30 31 31 25 25 25 25 3 3 2 2 1 1 3 8 8 5 8 2 31 1 1 1 33 32 8 17 4 3 1 4 1 1 4 46 11 44 3 33 7 19 5 12 20 32 2 22 16 15 7 1 1 6 22 23 13 1 13 1 1 1247 1246 141 141 99 71 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. */ #include <linux/completion.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> #include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> #include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> #include "core_priv.h" #include "cma_priv.h" #include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return ibcm_reject_msg(reason); if (rdma_protocol_iwarp(id->device, id->port_num)) return iwcm_reject_msg(reason); WARN_ON_ONCE(1); return "unrecognized transport"; } EXPORT_SYMBOL(rdma_reject_msg); /** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field. */ static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED; WARN_ON_ONCE(1); return false; } const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) { const void *p; if (rdma_is_consumer_reject(id, ev->status)) { *data_len = ev->param.conn.private_data_len; p = ev->param.conn.private_data; } else { *data_len = 0; p = NULL; } return p; } EXPORT_SYMBOL(rdma_consumer_reject_data); /** * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. * @id: Communication Identifier */ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); if (id->device->node_type == RDMA_NODE_RNIC) return id_priv->cm_id.iw; return NULL; } EXPORT_SYMBOL(rdma_iw_cm_id); /** * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. * @res: rdma resource tracking entry pointer */ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); return &id_priv->id; } EXPORT_SYMBOL(rdma_res_to_id); static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct rb_root id_table = RB_ROOT; /* Serialize operations of id_table tree */ static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { struct xarray tcp_ps; struct xarray udp_ps; struct xarray ipoib_ps; struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) { return net_generic(net, cma_pernet_id); } static struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; default: return NULL; } } struct id_table_entry { struct list_head id_list; struct rb_node rb_node; }; struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; void cma_dev_get(struct cma_device *cma_dev) { refcount_inc(&cma_dev->refcount); } void cma_dev_put(struct cma_device *cma_dev) { if (refcount_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; if (default_gid_type == IB_GID_TYPE_IB && rdma_protocol_roce_eth_encap(cma_dev->device, port)) default_gid_type = IB_GID_TYPE_ROCE; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = default_roce_tos; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *sa_mc; struct { struct work_struct work; struct rdma_cm_event event; } iboe_join; }; struct list_head list; void *context; struct sockaddr_storage addr; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; union ib_gid local_gid; __be64 service_id; int port; bool has_gid; u16 pkey; }; static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; /* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex. */ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; if (ndev) { rtnl_lock(); in_dev = __in_dev_get_rtnl(ndev); if (in_dev) { if (join) ip_mc_inc_group(in_dev, *(__be32 *)(mgid->raw + 12)); else ip_mc_dec_group(in_dev, *(__be32 *)(mgid->raw + 12)); } rtnl_unlock(); } return (in_dev) ? 0 : -ENODEV; } static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, struct id_table_entry *entry_b) { struct rdma_id_private *id_priv = list_first_entry( &entry_b->id_list, struct rdma_id_private, id_list_entry); int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; struct sockaddr *sb = cma_dst_addr(id_priv); if (ifindex_a != ifindex_b) return (ifindex_a > ifindex_b) ? 1 : -1; if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; if (sa->sa_family == AF_INET && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { return memcmp(&((struct sockaddr_in *)sa)->sin_addr, &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); } if (sa->sa_family == AF_INET6 && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &((struct sockaddr_in6 *)sb)->sin6_addr); } return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) { struct rb_node **new, *parent = NULL; struct id_table_entry *this, *node; unsigned long flags; int result; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node); result = compare_netdev_and_ip( node_id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(node_id_priv), this); parent = *new; if (result < 0) new = &((*new)->rb_left); else if (result > 0) new = &((*new)->rb_right); else { list_add_tail(&node_id_priv->id_list_entry, &this->id_list); kfree(node); goto unlock; } } INIT_LIST_HEAD(&node->id_list); list_add_tail(&node_id_priv->id_list_entry, &node->id_list); rb_link_node(&node->rb_node, parent, new); rb_insert_color(&node->rb_node, &id_table); unlock: spin_unlock_irqrestore(&id_table_lock, flags); return 0; } static struct id_table_entry * node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) { struct rb_node *node = root->rb_node; struct id_table_entry *data; int result; while (node) { data = container_of(node, struct id_table_entry, rb_node); result = compare_netdev_and_ip(ifindex, sa, data); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return data; } return NULL; } static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) { struct id_table_entry *data; unsigned long flags; spin_lock_irqsave(&id_table_lock, flags); if (list_empty(&id_priv->id_list_entry)) goto out; data = node_from_ndev_ip(&id_table, id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(id_priv)); if (!data) goto out; list_del_init(&id_priv->id_list_entry); if (list_empty(&data->id_list)) { rb_erase(&data->rb_node, &id_table); kfree(data); } out: spin_unlock_irqrestore(&id_table_lock, flags); } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->device_item, &cma_dev->id_list); trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del_init(&id_priv->device_item); cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; id_priv->id.device = NULL; if (id_priv->id.route.addr.dev_addr.sgid_attr) { rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = NULL; } mutex_unlock(&lock); } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { if (!qkey || (id_priv->qkey && (id_priv->qkey != qkey))) return -EINVAL; id_priv->qkey = qkey; return 0; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static const struct ib_gid_attr * cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out; /* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. * * Other driver classes might be included in the future. */ if (rdma_protocol_iwarp(device, port)) { sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out; rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) sgid_attr = ERR_PTR(-ENODEV); rcu_read_unlock(); goto out; } if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out; } else { gid_type = IB_GID_TYPE_IB; } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); dev_put(ndev); out: return sgid_attr; } static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, const struct ib_gid_attr *sgid_attr) { WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } /** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id. */ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, gidp, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); cma_attach_to_dev(id_priv, cma_dev); ret = 0; goto out; } } } out: mutex_unlock(&lock); return ret; } /** * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute * @id_priv: cm id to bind to cma device * @listen_id_priv: listener cm id to match against * @req: Pointer to req structure containaining incoming * request information * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when * rdma device matches for listen_id and incoming request. It also verifies * that a GID table entry is present for the source address. * Returns 0 on success, or returns error code otherwise. */ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv, struct cma_req_info *req) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; if (rdma_protocol_roce(req->device, req->port)) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &gid); else memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; sgid_attr = cma_validate_port(req->device, req->port, gid_type, &gid, id_priv); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); id_priv->id.port_num = req->port; cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove(). */ mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); rdma_restrack_add(&id_priv->res); return 0; } static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } } } out: if (!ret) { cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); } mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. */ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; unsigned int p; u16 pkey, index; enum ib_port_state port_state; int ret; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(cur_dev->device, p, i, &gid); if (ret) continue; if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } } } } mutex_unlock(&lock); return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_id_get(struct rdma_id_private *id_priv) { refcount_inc(&id_priv->refcount); } static void cma_id_put(struct rdma_id_private *id_priv) { if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static struct rdma_id_private * __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) rdma_restrack_parent_name(&id_priv->res, &parent->res); return id_priv; } struct rdma_cm_id * __rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *ret; ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, caller); return &ret->id; } EXPORT_SYMBOL(__rdma_create_kernel_id); struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *ret; ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, NULL); return &ret->id; } EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) { ret = -EINVAL; goto out_err; } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto out_err; } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; out_destroy: ib_destroy_qp(qp); out_err: trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else { ret = -ENOSYS; } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback( ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback( &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_loopback( &((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *)src)->sin_addr.s_addr != ((struct sockaddr_in *)dst)->sin_addr.s_addr; case AF_INET6: { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; bool link_local; if (ipv6_addr_cmp(&src_addr6->sin6_addr, &dst_addr6->sin6_addr)) return 1; link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id != dst_addr6->sin6_scope_id) : 0; } default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } static bool validate_ipv4_net_dev(struct net_device *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct fib_result res; struct flowi4 fl4; int err; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; } static bool validate_ipv6_net_dev(struct net_device *net_dev, const struct sockaddr_in6 *dst_addr, const struct sockaddr_in6 *src_addr) { #if IS_ENABLED(CONFIG_IPV6) const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, NULL, strict); bool ret; if (!rt) return false; ret = rt->rt6i_idev->dev == net_dev; ip6_rt_put(rt); return ret; #else return false; #endif } static bool validate_net_dev(struct net_device *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; if (!sgid_attr) return NULL; rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); if (IS_ERR(ndev)) ndev = NULL; else dev_hold(ndev); rcu_read_unlock(); return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = (struct sockaddr *)&req->listen_addr_storage; struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) net_dev = roce_get_net_dev_by_cm_event(ib_event); else net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); return net_dev; } static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const struct cma_hdr *hdr) { struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { const struct sockaddr *daddr = (const struct sockaddr *)&req->listen_addr_storage; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; /* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 && (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* * If the request is not for IPv6 link local, allow matching * request to any netdevice of the one or multiport rdma device. */ if (!cma_is_req_ipv6_ll(req)) return true; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. */ if (net_eq(dev_net(net_dev), addr->dev_addr.net) && (!!addr->dev_addr.bound_dev_if == (addr->dev_addr.bound_dev_if == net_dev->ifindex))) return true; else return false; } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, const struct net_device *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; lockdep_assert_held(&lock); if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, struct cma_req_info *req, struct net_device **net_dev) { struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net(). */ rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change. */ if (((*net_dev)->flags & IFF_UP) == 0) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } if (!validate_net_dev(*net_dev, (struct sockaddr *)&req->src_addr_storage, (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req->service_id), cma_port_from_service_id(req->service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; lockdep_assert_held(&lock); /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_first_entry(&id_priv->listen_list, struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->device_item); list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { mutex_lock(&lock); _cma_cancel_listens(id_priv); mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: /* * We can avoid doing the rdma_addr_cancel() based on state, * only RDMA_CM_ADDR_QUERY has a work that could still execute. * Notice that the addr_handler work could still be exiting * outside this state, however due to the interaction with the * handler_mutex the work is guaranteed not to touch id_priv * during exit. */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev && !send_only) { enum ib_gid_type gid_type; union ib_gid mgid; gid_type = id_priv->cma_dev->default_gid_type [id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, gid_type); cma_igmp_send(ndev, &mgid, false); } dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, list); list_del(&mc->list); destroy_mc(id_priv, mc); } } static void _destroy_id(struct rdma_id_private *id_priv, enum rdma_cm_state state) { cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } /* * destroy an ID from within the handler_mutex. This ensures that no other * handlers can start running concurrently. */ static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) __releases(&idprv->handler_mutex) { enum rdma_cm_state state; unsigned long flags; trace_cm_id_destroy(id_priv); /* * Setting the state to destroyed under the handler mutex provides a * fence against calling handler callbacks. If this is invoked due to * the failure of a handler callback then it guarentees that no future * handlers will be called. */ lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; id_priv->state = RDMA_CM_DESTROYING; spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); _destroy_id(id_priv, state); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); destroy_id_handler_unlock(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; event->ece.vendor_id = rep_data->ece.vendor_id; event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, struct rdma_cm_event *event) { int ret; lockdep_assert_held(&id_priv->handler_mutex); trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static struct rdma_id_private * cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private * cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; event->ece.vendor_id = req_data->ece.vendor_id; event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_ib_req_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) { destroy_id_handler_unlock(conn_id); goto err_unlock; } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ conn_id->cm_id.ib = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); goto net_dev_put; } if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&conn_id->handler_mutex); err_unlock: mutex_unlock(&listen_id->handler_mutex); net_dev_put: dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid) { struct rdma_addr *addr = &cm_id->route.addr; if (!cm_id->device) { if (sgid) memset(sgid, 0, sizeof(*sgid)); if (dgid) memset(dgid, 0, sizeof(*dgid)); return; } if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { if (sgid) rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); if (dgid) rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); } else { if (sgid) rdma_addr_get_sgid(&addr->dev_addr, sgid); if (dgid) rdma_addr_get_dgid(&addr->dev_addr, dgid); } } EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; /* Listening IDs are always destroyed on removal */ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) return -1; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } static int cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev, struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; lockdep_assert_held(&lock); *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type, id_priv); if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); rdma_restrack_add(&dev_id_priv->res); cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; mutex_unlock(&id_priv->qp_mutex); ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) goto err_listen; list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); return 0; err_listen: /* Caller must destroy this after releasing lock */ *to_destroy = dev_id_priv; dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); return ret; } static int cma_listen_on_all(struct rdma_id_private *id_priv) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; int ret; mutex_lock(&lock); list_add_tail(&id_priv->listen_any_item, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) { /* Prevent racing with cma_process_remove() */ if (to_destroy) list_del_init(&to_destroy->device_item); goto err_listen; } } mutex_unlock(&lock); return 0; err_listen: _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); /** * rdma_set_ack_timeout() - Set the ack timeout of QP associated * with a connection identifier. * @id: Communication identifier to associated with service type. * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. * * This function should be called before rdma_connect() on active side, * and on passive side before rdma_accept(). It is applicable to primary * path only. The timeout will affect the local side of the QP, it is not * negotiated with remote side and zero disables the timer. In case it is * set before rdma_resolve_route, the value will also be used to determine * PacketLifeTime for RoCE. * * Return: 0 for success */ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); /** * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the * QP associated with a connection identifier. * @id: Communication identifier to associated with service type. * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK * Timer Field" in the IBTA specification. * * This function should be called before rdma_connect() on active * side, and on passive side before rdma_accept(). The timer value * will be associated with the local QP. When it receives a send it is * not read to handle, typically if the receive queue is empty, an RNR * Retry NAK is returned to the requester with the min_rnr_timer * encoded. The requester will then wait at least the time specified * in the NAK before retrying. The default is zero, which translates * to a minimum RNR Timer value of 655 ms. * * Return: 0 for success */ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) { struct rdma_id_private *id_priv; /* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL; if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->min_rnr_timer = min_rnr_timer; id_priv->min_rnr_timer_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_min_rnr_timer); static int route_set_path_rec_inbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_inbound) { route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) return -ENOMEM; } *route->path_rec_inbound = *path_rec; return 0; } static int route_set_path_rec_outbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_outbound) { route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) return -ENOMEM; } *route->path_rec_outbound = *path_rec; return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; int i; route = &work->id->id.route; if (status) goto fail; for (i = 0; i < num_prs; i++) { if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) status = route_set_path_rec_outbound(work, &path_rec[i]); else status = -EINVAL; if (status) goto fail; } route->num_pri_alt_paths = 1; queue_work(cma_wq, &work->work); return; fail: work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", status); queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) path_rec.rec_type = SA_PATH_REC_TYPE_OPA; else path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_iboe_join_work_handler(struct work_struct *work) { struct cma_multicast *mc = container_of(work, struct cma_multicast, iboe_join.work); struct rdma_cm_event *event = &mc->iboe_join.event; struct rdma_id_private *id_priv = mc->id_priv; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; ret = cma_cm_event_handler(id_priv, event); WARN_ON(ret); out_unlock: mutex_unlock(&id_priv->handler_mutex); if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&event->param.ud.ah_attr); } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; if (work->old_state != 0 || work->new_state != 0) { if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out_unlock; } if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); goto out_free; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } static void cma_init_resolve_route_work(struct cma_work *work, struct rdma_id_private *id_priv) { work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } static void enqueue_resolve_addr_work(struct cma_work *work, struct rdma_id_private *id_priv) { /* Balances with cma_id_put() in cma_work_handler */ cma_id_get(id_priv); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); if (!route->path_rec) route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } /* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry. */ static struct net_device * cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsigned long supported_gids; struct net_device *ndev; if (!addr->dev_addr.bound_dev_if) return NULL; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) return NULL; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } if (rdma_protocol_roce(id->device, id->port_num)) { ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err_free; } dev_put(ndev); } id->route.num_pri_alt_paths = 1; return 0; err_free: kfree(id->route.path_rec); id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { struct net_device *dev; dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } struct iboe_prio_tc_map { int input_prio; int output_tc; bool found; }; static int get_lower_vlan_dev_tc(struct net_device *dev, struct netdev_nested_priv *priv) { struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); else if (dev->num_tc) map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices. */ map->found = true; return 1; } static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. */ if (prio_tc_map.found) return prio_tc_map.output_tc; else if (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0; } static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { struct sockaddr_in6 *addr6; u16 dport, sport; u32 hash, fl; addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; if ((cma_family(id_priv) != AF_INET6) || !fl) { dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); hash = (u32)sport * 31 + dport; fl = hash & IB_GRH_FLOWLABEL_MASK; } return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct net_device *ndev; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos; mutex_lock(&id_priv->qp_mutex); tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, tos); route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout) route->path_rec->packet_life_time = id_priv->timeout - 1; else route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } if (rdma_protocol_roce_udp_encap(id_priv->id.device, id_priv->id.port_num)) route->path_rec->flow_label = cma_get_roce_udp_flow_label(id_priv); cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); if (!ret) cma_add_id_to_tree(id_priv); } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; unsigned int p; u16 pkey; int ret; cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; struct sockaddr *addr; struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; /* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly. */ addr = cma_src_addr(id_priv); memcpy(&old_addr, addr, rdma_addr_size(addr)); memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } if (status) { memcpy(addr, &old_addr, rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if ((reuse && id_priv->state != RDMA_CM_LISTEN) || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; lockdep_assert_held(&lock); bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr *daddr = cma_dst_addr(id_priv); struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); lockdep_assert_held(&lock); hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); __be16 cur_dport = cma_port(cur_daddr); if (id_priv == cur_id) continue; /* different dest port -> unique */ if (!cma_any_port(daddr) && !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; /* different src address -> unique */ if (!cma_any_addr(saddr) && !cma_any_addr(cur_saddr) && cma_addr_cmp(saddr, cur_saddr)) continue; /* different dst address -> unique */ if (!cma_any_addr(daddr) && !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; return -EADDRNOTAVAIL; } return 0; } static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; lockdep_assert_held(&lock); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; int ret; bind_list = cma_ps_find(net, ps, (unsigned short)rover); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, rover); } else { ret = cma_port_is_unique(bind_list, id_priv); if (!ret) cma_bind_port(bind_list, id_priv); } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. */ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; lockdep_assert_held(&lock); snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default: return 0; } } static enum rdma_ucm_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT; mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return 0; if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { struct sockaddr_in any_in = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))) return -EINVAL; } /* * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable * any more, and has to be unique in the bind list. */ if (id_priv->reuseaddr) { mutex_lock(&lock); ret = cma_check_port(id_priv->bind_list, id_priv, 0); if (!ret) id_priv->reuseaddr = 0; mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; } else { ret = -ENOSYS; goto err; } } else { ret = cma_listen_on_all(id_priv); if (ret) goto err; } return 0; err: id_priv->backlog = 0; /* * All the failure paths that lead here will not allow the req_handler's * to have run. */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, struct sockaddr *addr, const struct sockaddr *daddr) { struct sockaddr *id_daddr; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) else if (addr->sa_family == AF_INET6) { struct net *net = id_priv->id.route.addr.dev_addr.net; id_priv->afonly = net->ipv6.sysctl.bindv6only; } #endif } id_daddr = cma_dst_addr(id_priv); if (daddr != id_daddr) memcpy(id_daddr, daddr, rdma_addr_size(addr)); id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) goto err2; if (!cma_any_addr(addr)) rdma_restrack_add(&id_priv->res); return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct sockaddr_storage zero_sock = {}; if (src_addr && src_addr->sa_family) return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); /* * When the src_addr is not specified, automatically supply an any addr */ zero_sock.ss_family = dst_addr->sa_family; if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)&zero_sock; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr; src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *)&zero_sock)->sib_pkey = ((struct sockaddr_ib *)dst_addr)->sib_pkey; } return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); } /* * If required, resolve the source address for bind and leave the id_priv in * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior * calls made by ULP, a previously bound ID will not be re-bound and src_addr is * ignored. */ static int resolve_prepare_src(struct rdma_id_private *id_priv, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))) return -EINVAL; } else { memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); } if (cma_family(id_priv) != dst_addr->sa_family) { ret = -EINVAL; goto err_state; } return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; ret = resolve_prepare_src(id_priv, src_addr, dst_addr); if (ret) return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { /* * The FSM can return back to RDMA_CM_ADDR_BOUND after * rdma_resolve_ip() is called, eg through the error * path in addr_handler(). If this happens the existing * request must be canceled before issuing a new one. * Since canceling a request is a bit slow and this * oddball path is rare, keep track once a request has * been issued. The track turns out to be a permanent * state since this is the only cancel as it is * immediately before rdma_resolve_ip(). */ if (id_priv->used_resolve_ip) rdma_addr_cancel(&id->route.addr.dev_addr); else id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, false, id_priv); } } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", event.status); break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } ib_init_ah_attr_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr, rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } req.primary_path = &route->path_rec[0]; req.primary_path_inbound = route->path_rec_inbound; req.primary_path_outbound = route->path_rec_outbound; if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; req.ece.vendor_id = id_priv->ece.vendor_id; req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; cm_id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } /** * rdma_connect_locked - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Same as rdma_connect() but can only be called from the * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. */ int rdma_connect_locked(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto err_state; return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect_locked); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with by having * called rdma_resolve_route before calling this routine. * * This call will either connect to a remote QP or obtain remote QP information * for unconnected rdma_cm_id's. The actual operation is based on the * rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; mutex_lock(&id_priv->handler_mutex); ret = rdma_connect_locked(id, conn_param); mutex_unlock(&id_priv->handler_mutex); return ret; } EXPORT_SYMBOL(rdma_connect); /** * rdma_connect_ece - Initiate an active connection request with ECE data. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * @ece: ECE parameters * * See rdma_connect() explanation. */ int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_connect(id, conn_param); } EXPORT_SYMBOL(rdma_connect_ece); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { if (qkey) ret = cma_set_qkey(id_priv, qkey); else ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; } rep.private_data = private_data; rep.private_data_len = private_data_len; trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. * * This function is for use by kernel ULPs and must be called from under the * handler callback. */ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; lockdep_assert_held(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_accept(id, conn_param); } EXPORT_SYMBOL(rdma_accept_ece); void rdma_lock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_lock_handler); void rdma_unlock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_unlock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); } else { ret = -ENOSYS; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ trace_cm_disconnect(id_priv); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) trace_cm_sent_drep(id_priv); } else { trace_cm_sent_dreq(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else ret = -EINVAL; out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, struct ib_sa_multicast *multicast, struct rdma_cm_event *event, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr; enum ib_gid_type gid_type; struct net_device *ndev; if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); event->status = status; event->param.ud.private_data = mc->context; if (status) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; return; } dev_addr = &id_priv->id.route.addr.dev_addr; ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); gid_type = id_priv->cma_dev ->default_gid_type[id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; event->event = RDMA_CM_EVENT_MULTICAST_JOIN; if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, ndev, gid_type, &event->param.ud.ah_attr)) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; goto out; } event->param.ud.qp_num = 0xFFFFFF; event->param.ud.qkey = id_priv->qkey; out: dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct cma_multicast *mc = multicast->context; struct rdma_id_private *id_priv = mc->id_priv; struct rdma_cm_event event = {}; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); if (!ret) { cma_make_mc_event(status, id_priv, multicast, &event, mc); ret = cma_cm_event_handler(id_priv, &event); } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; if (!id_priv->qkey) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; mgid->raw[1] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; struct ib_sa_multicast ib = {}; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr(addr)) return -EINVAL; gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) return -ENODEV; ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) err = -ENOTSUPP; } dev_put(ndev); if (err || !ib.rec.mtu) return err ?: -EINVAL; if (!id_priv->qkey) cma_set_default_qkey(id_priv); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); queue_work(cma_wq, &mc->iboe_join.work); return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; /* Not supported for kernel QPs */ if (WARN_ON(id->qp)) return -EINVAL; /* ULP is calling this wrong. */ if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; if (id_priv->id.qp_type != IB_QPT_UD) return -EINVAL; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); if (ret) goto out_err; } else { ret = -ENOSYS; goto out_err; } spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); return 0; out_err: kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) continue; list_del(&mc->list); spin_unlock_irq(&id_priv->lock); WARN_ON(id_priv->cma_dev->device != id->device); destroy_mc(id_priv, mc); return; } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { pr_info("RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; cma_id_get(id_priv); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static void cma_netevent_work_handler(struct work_struct *_work) { struct rdma_id_private *id_priv = container_of(_work, struct rdma_id_private, id.net_work); struct rdma_cm_event event = {}; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; if (cma_cm_event_handler(id_priv, &event)) { __acquire(&id_priv->handler_mutex); id_priv->cm_id.ib = NULL; cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); return; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); } static int cma_netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct id_table_entry *ips_node = NULL; struct rdma_id_private *current_id; struct neighbour *neigh = ctx; unsigned long flags; if (event != NETEVENT_NEIGH_UPDATE) return NOTIFY_DONE; spin_lock_irqsave(&id_table_lock, flags); if (neigh->tbl->family == AF_INET6) { struct sockaddr_in6 neigh_sock_6; neigh_sock_6.sin6_family = AF_INET6; neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_6); } else if (neigh->tbl->family == AF_INET) { struct sockaddr_in neigh_sock_4; neigh_sock_4.sin_family = AF_INET; neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_4); } else goto out; if (!ips_node) goto out; list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); queue_work(cma_wq, &current_id->id.net_work); } out: spin_unlock_irqrestore(&id_table_lock, flags); return NOTIFY_DONE; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static struct notifier_block cma_netevent_cb = { .notifier_call = cma_netevent_callback }; static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; enum rdma_cm_state state; unsigned long flags; mutex_lock(&id_priv->handler_mutex); /* Record that we want to remove the device */ spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); return; } id_priv->state = RDMA_CM_DEVICE_REMOVAL; spin_unlock_irqrestore(&id_priv->lock, flags); if (cma_cm_event_handler(id_priv, &event)) { /* * At this point the ULP promises it won't call * rdma_destroy_id() concurrently */ cma_id_put(id_priv); mutex_unlock(&id_priv->handler_mutex); trace_cm_id_destroy(id_priv); _destroy_id(id_priv, state); return; } mutex_unlock(&id_priv->handler_mutex); /* * If this races with destroy then the thread that first assigns state * to a destroying does the cancel. */ cma_cancel_operation(id_priv, state); cma_id_put(id_priv); } static void cma_process_remove(struct cma_device *cma_dev) { mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { struct rdma_id_private *id_priv = list_first_entry( &cma_dev->id_list, struct rdma_id_private, device_item); list_del_init(&id_priv->listen_item); list_del_init(&id_priv->device_item); cma_id_get(id_priv); mutex_unlock(&lock); cma_send_device_removal_put(id_priv); mutex_lock(&lock); } mutex_unlock(&lock); cma_dev_put(cma_dev); wait_for_completion(&cma_dev->comp); } static bool cma_supported(struct ib_device *device) { u32 i; rdma_for_each_port(device, i) { if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) return true; } return false; } static int cma_add_one(struct ib_device *device) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned long supported_gids = 0; int ret; u32 i; if (!cma_supported(device)) return -EOPNOTSUPP; cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); if (!cma_dev->default_gid_type) { ret = -ENOMEM; goto free_cma_dev; } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); if (!cma_dev->default_roce_tos) { ret = -ENOMEM; goto free_gid_type; } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) cma_dev->default_gid_type[i - rdma_start_port(device)] = CMA_PREFERRED_ROCE_GID_TYPE; else cma_dev->default_gid_type[i - rdma_start_port(device)] = find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) goto free_listen; } mutex_unlock(&lock); trace_cm_add_one(device); return 0; free_listen: list_del(&cma_dev->list); mutex_unlock(&lock); /* cma_process_remove() will delete to_destroy */ cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); return ret; } static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); kfree(cma_dev->default_gid_type); kfree(cma_dev); } static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); xa_init(&pernet->tcp_ps); xa_init(&pernet->udp_ps); xa_init(&pernet->ipoib_ps); xa_init(&pernet->ib_ps); return 0; } static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); WARN_ON(!xa_empty(&pernet->tcp_ps)); WARN_ON(!xa_empty(&pernet->udp_ps)); WARN_ON(!xa_empty(&pernet->ipoib_ps)); WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { .init = cma_init_net, .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), }; static int __init cma_init(void) { int ret; /* * There is a rare lock ordering dependency in cma_netdev_callback() * that only happens when bonding is enabled. Teach lockdep that rtnl * must never be nested under lock so it can find these without having * to test with bonding. */ if (IS_ENABLED(CONFIG_LOCKDEP)) { rtnl_lock(); mutex_lock(&lock); mutex_unlock(&lock); rtnl_unlock(); } cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; ret = register_pernet_subsys(&cma_pernet_operations); if (ret) goto err_wq; ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) goto err; ret = cma_configfs_init(); if (ret) goto err_ib; return 0; err_ib: ib_unregister_client(&cma_client); err: unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); err_wq: destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); } module_init(cma_init); module_exit(cma_cleanup);
23234 22 27 27 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
61 1 61 84 14 69 60 46 46 60 60 60 2 4 4 4 4 4 3 1 1 4 1 1 1 4 60 60 60 58 60 60 15 15 56 58 58 46 46 45 4 4 1 4 54 54 54 21 54 54 54 54 54 53 54 54 58 57 58 58 22 73 73 72 71 69 68 67 66 66 66 65 61 56 65 63 13 52 61 62 75 74 73 73 73 15 57 61 61 2 2 61 1 60 17 60 60 60 58 57 30 46 28 46 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 // SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. * Written by Takashi Sato <t-sato@yk.jp.nec.com> * Akira Fujita <a-fujita@rs.jp.nec.com> */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "ext4_extents.h" /** * get_ext_path() - Find an extent path for designated logical block number. * @inode: inode to be searched * @lblock: logical block number to find an extent path * @ppath: pointer to an extent path pointer (for output) * * ext4_find_extent wrapper. Return 0 on success, or a negative error value * on failure. */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, struct ext4_ext_path **ppath) { struct ext4_ext_path *path; path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); if (path[ext_depth(inode)].p_ext == NULL) { ext4_free_ext_path(path); *ppath = NULL; return -ENODATA; } *ppath = path; return 0; } /** * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem * @first: inode to be locked * @second: inode to be locked * * Acquire write lock of i_data_sem of the two inodes */ void ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); } else { down_write(&EXT4_I(second)->i_data_sem); down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); } } /** * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); } /** * mext_check_coverage - Check that all extents in range has the same type * * @inode: inode in question * @from: block offset of inode * @count: block count to be checked * @unwritten: extents expected to be unwritten * @err: pointer to save error value * * Return 1 if all extents in range has expected type, and zero otherwise. */ static int mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, int unwritten, int *err) { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) goto out; ext = path[ext_depth(inode)].p_ext; if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); } ret = 1; out: ext4_free_ext_path(path); return ret; } /** * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * @index1: folio index * @index2: folio index * @folio: result folio vector * * Grab two locked folio for inode's by inode order */ static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2, pgoff_t index1, pgoff_t index2, struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { swap(index1, index2); mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } flags = memalloc_nofs_save(); folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[0])); if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); return PTR_ERR(folio[0]); } folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if * BDI not demand that. But it is reasonable to be very conservative * here and explicitly wait on folio's writeback */ folio_wait_writeback(folio[0]); folio_wait_writeback(folio[1]); if (inode1 > inode2) swap(folio[0], folio[1]); return 0; } /* Force page buffers uptodate w/o dropping page's lock */ static int mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) { struct inode *inode = folio->mapping->host; sector_t block; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, block_start, block_end; int i, err, nr = 0, partial = 0; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (folio_test_uptodate(folio)) return 0; blocksize = i_blocksize(inode); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = 1; continue; } if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { err = ext4_get_block(inode, block, bh, 0); if (err) { folio_set_error(folio); return err; } if (!buffer_mapped(bh)) { folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); continue; } } BUG_ON(nr >= MAX_BUF_PER_PAGE); arr[nr++] = bh; } /* No io required */ if (!nr) goto out; for (i = 0; i < nr; i++) { bh = arr[i]; if (!bh_uptodate_or_lock(bh)) { err = ext4_read_bh(bh, 0, NULL); if (err) return err; } } out: if (!partial) folio_mark_uptodate(folio); return 0; } /** * move_extent_per_page - Move extent data per page * * @o_filp: file structure of original file * @donor_inode: donor inode * @orig_page_offset: page index on original file * @donor_page_offset: page index on donor file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling ext4_swap_extents(). * Finally, write out the saved data in new original inode blocks. Return * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, pgoff_t donor_page_offset, int data_offset_in_page, int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; /* * It needs twice the amount of ordinary journal buffers because * inode and donor_inode may change each different metadata blocks. */ again: *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); return 0; } orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; donor_blk_offset = donor_page_offset * blocks_per_page + data_offset_in_page; /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ tmp_data_size = orig_inode->i_size & (blocksize - 1); /* * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ if (tmp_data_size == 0) tmp_data_size = blocksize; data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); } else data_size = block_len_in_page << orig_inode->i_blkbits; replaced_size = data_size; *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, donor_page_offset, folio); if (unlikely(*err < 0)) goto stop_journal; /* * If orig extent was unwritten it can become initialized * at any time after i_data_sem was dropped, in order to * serialize with delalloc we have recheck extent while we * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ unwritten = mext_check_coverage(orig_inode, orig_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; if (!unwritten) { ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_folios; } data_copy: *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; } else goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ bh = folio_buffers(folio[0]); if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) goto repair_branches; bh = bh->b_this_page; } block_commit_write(&folio[0]->page, from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); unlock_folios: folio_unlock(folio[0]); folio_put(folio[0]); folio_unlock(folio[1]); folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto again; /* Buffer was busy because probably is pinned to journal transaction, * force transaction commit may help to free it. */ if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) goto again; return replaced_count; repair_branches: /* * This should never ever happen! * Extents are swapped already, but we are not able to copy data. * Try to swap extents to it's original places */ ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), EIO, "Unable to copy data block," " data will be lost."); *err = -EIO; } replaced_count = 0; goto unlock_folios; } /** * mext_check_arguments - Check whether move extent can be done * * @orig_inode: original inode * @donor_inode: donor inode * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. * Return 0 on success, or a negative error value on failure. */ static int mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len) { __u64 orig_eof, donor_eof; unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) return -EPERM; /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " "based file [ino:orig %lu]\n", orig_inode->i_ino); return -EOPNOTSUPP; } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: donor file is not extents " "based file [ino:donor %lu]\n", donor_inode->i_ino); return -EOPNOTSUPP; } if ((!orig_inode->i_size) || (!donor_inode->i_size)) { ext4_debug("ext4 move extent: File size is 0 byte\n"); return -EINVAL; } /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " "offsets are not aligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if ((orig_start >= EXT_MAX_BLOCKS) || (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (orig_eof <= orig_start) *len = 0; else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; if (donor_eof <= donor_start) *len = 0; else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } return 0; } /** * ext4_move_extents - Exchange the specified range of a file * * @o_filp: file structure of the original file * @d_filp: file structure of the donor file * @orig_blk: start offset in block for orig * @donor_blk: start offset in block for donor * @len: the number of blocks to be moved * @moved_len: moved block length * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. * */ int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *path = NULL; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; ext4_lblk_t o_end, o_start = orig_blk; ext4_lblk_t d_start = donor_blk; int ret; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " "should be in same FS [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* orig and donor should be different inodes */ if (orig_inode == donor_inode) { ext4_debug("ext4 move extent: The argument files should not " "be same inode [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ext4_debug("ext4 move extent: The argument files should be " "regular file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* TODO: it's not obvious how to swap blocks for inodes with full journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported with data journaling"); return -EOPNOTSUPP; } if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported for encrypted files"); return -EOPNOTSUPP; } /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); /* Wait for all existing dio workers */ inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, donor_blk, &len); if (ret) goto out; o_end = o_start + len; while (o_start < o_end) { struct ext4_extent *ex; ext4_lblk_t cur_blk, next_blk; pgoff_t orig_page_index, donor_page_index; int offset_in_page; int unwritten, cur_len; ret = get_ext_path(orig_inode, o_start, &path); if (ret) goto out; ex = path[path->p_depth].p_ext; cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { next_blk = ext4_ext_next_allocated_block(path); if (next_blk == EXT_MAX_BLOCKS) { ret = -ENODATA; goto out; } d_start += next_blk - o_start; o_start = next_blk; continue; /* Check hole after the start pos */ } else if (cur_blk > o_start) { /* Skip hole */ d_start += cur_blk - o_start; o_start = cur_blk; /* Extent inside requested range ?*/ if (cur_blk >= o_end) goto out; } else { /* in_range(o_start, o_blk, o_len) */ cur_len += cur_blk - o_start; } unwritten = ext4_ext_is_unwritten(ex); if (o_end - o_start < cur_len) cur_len = o_end - o_start; orig_page_index = o_start >> (PAGE_SHIFT - orig_inode->i_blkbits); donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: * a. transaction deadlock among ext4_journal_start, * ->write_begin via pagefault, and jbd2_journal_commit * b. racing with ->read_folio, ->write_begin, and * ext4_get_block in move_extent_per_page */ ext4_double_up_write_data_sem(orig_inode, donor_inode); /* Swap original branches with new branches */ move_extent_per_page(o_filp, donor_inode, orig_page_index, donor_page_index, offset_in_page, cur_len, unwritten, &ret); ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; o_start += cur_len; d_start += cur_len; } *moved_len = o_start - orig_blk; if (*moved_len > len) *moved_len = len; out: if (*moved_len) { ext4_discard_preallocations(orig_inode, 0); ext4_discard_preallocations(donor_inode, 0); } ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); return ret; }
17 39 18 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * V4L2 sub-device support header. * * Copyright (C) 2008 Hans Verkuil <hverkuil@xs4all.nl> */ #ifndef _V4L2_SUBDEV_H #define _V4L2_SUBDEV_H #include <linux/types.h> #include <linux/v4l2-subdev.h> #include <media/media-entity.h> #include <media/v4l2-async.h> #include <media/v4l2-common.h> #include <media/v4l2-dev.h> #include <media/v4l2-fh.h> #include <media/v4l2-mediabus.h> /* generic v4l2_device notify callback notification values */ #define V4L2_SUBDEV_IR_RX_NOTIFY _IOW('v', 0, u32) #define V4L2_SUBDEV_IR_RX_FIFO_SERVICE_REQ 0x00000001 #define V4L2_SUBDEV_IR_RX_END_OF_RX_DETECTED 0x00000002 #define V4L2_SUBDEV_IR_RX_HW_FIFO_OVERRUN 0x00000004 #define V4L2_SUBDEV_IR_RX_SW_FIFO_OVERRUN 0x00000008 #define V4L2_SUBDEV_IR_TX_NOTIFY _IOW('v', 1, u32) #define V4L2_SUBDEV_IR_TX_FIFO_SERVICE_REQ 0x00000001 #define V4L2_DEVICE_NOTIFY_EVENT _IOW('v', 2, struct v4l2_event) struct v4l2_device; struct v4l2_ctrl_handler; struct v4l2_event; struct v4l2_event_subscription; struct v4l2_fh; struct v4l2_subdev; struct v4l2_subdev_fh; struct tuner_setup; struct v4l2_mbus_frame_desc; struct led_classdev; /** * struct v4l2_decode_vbi_line - used to decode_vbi_line * * @is_second_field: Set to 0 for the first (odd) field; * set to 1 for the second (even) field. * @p: Pointer to the sliced VBI data from the decoder. On exit, points to * the start of the payload. * @line: Line number of the sliced VBI data (1-23) * @type: VBI service type (V4L2_SLICED_*). 0 if no service found */ struct v4l2_decode_vbi_line { u32 is_second_field; u8 *p; u32 line; u32 type; }; /* * Sub-devices are devices that are connected somehow to the main bridge * device. These devices are usually audio/video muxers/encoders/decoders or * sensors and webcam controllers. * * Usually these devices are controlled through an i2c bus, but other buses * may also be used. * * The v4l2_subdev struct provides a way of accessing these devices in a * generic manner. Most operations that these sub-devices support fall in * a few categories: core ops, audio ops, video ops and tuner ops. * * More categories can be added if needed, although this should remain a * limited set (no more than approx. 8 categories). * * Each category has its own set of ops that subdev drivers can implement. * * A subdev driver can leave the pointer to the category ops NULL if * it does not implement them (e.g. an audio subdev will generally not * implement the video category ops). The exception is the core category: * this must always be present. * * These ops are all used internally so it is no problem to change, remove * or add ops or move ops from one to another category. Currently these * ops are based on the original ioctls, but since ops are not limited to * one argument there is room for improvement here once all i2c subdev * drivers are converted to use these ops. */ /* * Core ops: it is highly recommended to implement at least these ops: * * log_status * g_register * s_register * * This provides basic debugging support. * * The ioctl ops is meant for generic ioctl-like commands. Depending on * the use-case it might be better to use subdev-specific ops (currently * not yet implemented) since ops provide proper type-checking. */ /** * enum v4l2_subdev_io_pin_bits - Subdevice external IO pin configuration * bits * * @V4L2_SUBDEV_IO_PIN_DISABLE: disables a pin config. ENABLE assumed. * @V4L2_SUBDEV_IO_PIN_OUTPUT: set it if pin is an output. * @V4L2_SUBDEV_IO_PIN_INPUT: set it if pin is an input. * @V4L2_SUBDEV_IO_PIN_SET_VALUE: to set the output value via * &struct v4l2_subdev_io_pin_config->value. * @V4L2_SUBDEV_IO_PIN_ACTIVE_LOW: pin active is bit 0. * Otherwise, ACTIVE HIGH is assumed. */ enum v4l2_subdev_io_pin_bits { V4L2_SUBDEV_IO_PIN_DISABLE = 0, V4L2_SUBDEV_IO_PIN_OUTPUT = 1, V4L2_SUBDEV_IO_PIN_INPUT = 2, V4L2_SUBDEV_IO_PIN_SET_VALUE = 3, V4L2_SUBDEV_IO_PIN_ACTIVE_LOW = 4, }; /** * struct v4l2_subdev_io_pin_config - Subdevice external IO pin configuration * * @flags: bitmask with flags for this pin's config, whose bits are defined by * &enum v4l2_subdev_io_pin_bits. * @pin: Chip external IO pin to configure * @function: Internal signal pad/function to route to IO pin * @value: Initial value for pin - e.g. GPIO output value * @strength: Pin drive strength */ struct v4l2_subdev_io_pin_config { u32 flags; u8 pin; u8 function; u8 value; u8 strength; }; /** * struct v4l2_subdev_core_ops - Define core ops callbacks for subdevs * * @log_status: callback for VIDIOC_LOG_STATUS() ioctl handler code. * * @s_io_pin_config: configure one or more chip I/O pins for chips that * multiplex different internal signal pads out to IO pins. This function * takes a pointer to an array of 'n' pin configuration entries, one for * each pin being configured. This function could be called at times * other than just subdevice initialization. * * @init: initialize the sensor registers to some sort of reasonable default * values. Do not use for new drivers and should be removed in existing * drivers. * * @load_fw: load firmware. * * @reset: generic reset command. The argument selects which subsystems to * reset. Passing 0 will always reset the whole chip. Do not use for new * drivers without discussing this first on the linux-media mailinglist. * There should be no reason normally to reset a device. * * @s_gpio: set GPIO pins. Very simple right now, might need to be extended with * a direction argument if needed. * * @command: called by in-kernel drivers in order to call functions internal * to subdev drivers driver that have a separate callback. * * @ioctl: called at the end of ioctl() syscall handler at the V4L2 core. * used to provide support for private ioctls used on the driver. * * @compat_ioctl32: called when a 32 bits application uses a 64 bits Kernel, * in order to fix data passed from/to userspace. * * @g_register: callback for VIDIOC_DBG_G_REGISTER() ioctl handler code. * * @s_register: callback for VIDIOC_DBG_S_REGISTER() ioctl handler code. * * @s_power: puts subdevice in power saving mode (on == 0) or normal operation * mode (on == 1). DEPRECATED. See * Documentation/driver-api/media/camera-sensor.rst . pre_streamon and * post_streamoff callbacks can be used for e.g. setting the bus to LP-11 * mode before s_stream is called. * * @interrupt_service_routine: Called by the bridge chip's interrupt service * handler, when an interrupt status has be raised due to this subdev, * so that this subdev can handle the details. It may schedule work to be * performed later. It must not sleep. **Called from an IRQ context**. * * @subscribe_event: used by the drivers to request the control framework that * for it to be warned when the value of a control changes. * * @unsubscribe_event: remove event subscription from the control framework. */ struct v4l2_subdev_core_ops { int (*log_status)(struct v4l2_subdev *sd); int (*s_io_pin_config)(struct v4l2_subdev *sd, size_t n, struct v4l2_subdev_io_pin_config *pincfg); int (*init)(struct v4l2_subdev *sd, u32 val); int (*load_fw)(struct v4l2_subdev *sd); int (*reset)(struct v4l2_subdev *sd, u32 val); int (*s_gpio)(struct v4l2_subdev *sd, u32 val); long (*command)(struct v4l2_subdev *sd, unsigned int cmd, void *arg); long (*ioctl)(struct v4l2_subdev *sd, unsigned int cmd, void *arg); #ifdef CONFIG_COMPAT long (*compat_ioctl32)(struct v4l2_subdev *sd, unsigned int cmd, unsigned long arg); #endif #ifdef CONFIG_VIDEO_ADV_DEBUG int (*g_register)(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg); int (*s_register)(struct v4l2_subdev *sd, const struct v4l2_dbg_register *reg); #endif int (*s_power)(struct v4l2_subdev *sd, int on); int (*interrupt_service_routine)(struct v4l2_subdev *sd, u32 status, bool *handled); int (*subscribe_event)(struct v4l2_subdev *sd, struct v4l2_fh *fh, struct v4l2_event_subscription *sub); int (*unsubscribe_event)(struct v4l2_subdev *sd, struct v4l2_fh *fh, struct v4l2_event_subscription *sub); }; /** * struct v4l2_subdev_tuner_ops - Callbacks used when v4l device was opened * in radio mode. * * @standby: puts the tuner in standby mode. It will be woken up * automatically the next time it is used. * * @s_radio: callback that switches the tuner to radio mode. * drivers should explicitly call it when a tuner ops should * operate on radio mode, before being able to handle it. * Used on devices that have both AM/FM radio receiver and TV. * * @s_frequency: callback for VIDIOC_S_FREQUENCY() ioctl handler code. * * @g_frequency: callback for VIDIOC_G_FREQUENCY() ioctl handler code. * freq->type must be filled in. Normally done by video_ioctl2() * or the bridge driver. * * @enum_freq_bands: callback for VIDIOC_ENUM_FREQ_BANDS() ioctl handler code. * * @g_tuner: callback for VIDIOC_G_TUNER() ioctl handler code. * * @s_tuner: callback for VIDIOC_S_TUNER() ioctl handler code. @vt->type must be * filled in. Normally done by video_ioctl2 or the * bridge driver. * * @g_modulator: callback for VIDIOC_G_MODULATOR() ioctl handler code. * * @s_modulator: callback for VIDIOC_S_MODULATOR() ioctl handler code. * * @s_type_addr: sets tuner type and its I2C addr. * * @s_config: sets tda9887 specific stuff, like port1, port2 and qss * * .. note:: * * On devices that have both AM/FM and TV, it is up to the driver * to explicitly call s_radio when the tuner should be switched to * radio mode, before handling other &struct v4l2_subdev_tuner_ops * that would require it. An example of such usage is:: * * static void s_frequency(void *priv, const struct v4l2_frequency *f) * { * ... * if (f.type == V4L2_TUNER_RADIO) * v4l2_device_call_all(v4l2_dev, 0, tuner, s_radio); * ... * v4l2_device_call_all(v4l2_dev, 0, tuner, s_frequency); * } */ struct v4l2_subdev_tuner_ops { int (*standby)(struct v4l2_subdev *sd); int (*s_radio)(struct v4l2_subdev *sd); int (*s_frequency)(struct v4l2_subdev *sd, const struct v4l2_frequency *freq); int (*g_frequency)(struct v4l2_subdev *sd, struct v4l2_frequency *freq); int (*enum_freq_bands)(struct v4l2_subdev *sd, struct v4l2_frequency_band *band); int (*g_tuner)(struct v4l2_subdev *sd, struct v4l2_tuner *vt); int (*s_tuner)(struct v4l2_subdev *sd, const struct v4l2_tuner *vt); int (*g_modulator)(struct v4l2_subdev *sd, struct v4l2_modulator *vm); int (*s_modulator)(struct v4l2_subdev *sd, const struct v4l2_modulator *vm); int (*s_type_addr)(struct v4l2_subdev *sd, struct tuner_setup *type); int (*s_config)(struct v4l2_subdev *sd, const struct v4l2_priv_tun_config *config); }; /** * struct v4l2_subdev_audio_ops - Callbacks used for audio-related settings * * @s_clock_freq: set the frequency (in Hz) of the audio clock output. * Used to slave an audio processor to the video decoder, ensuring that * audio and video remain synchronized. Usual values for the frequency * are 48000, 44100 or 32000 Hz. If the frequency is not supported, then * -EINVAL is returned. * * @s_i2s_clock_freq: sets I2S speed in bps. This is used to provide a standard * way to select I2S clock used by driving digital audio streams at some * board designs. Usual values for the frequency are 1024000 and 2048000. * If the frequency is not supported, then %-EINVAL is returned. * * @s_routing: used to define the input and/or output pins of an audio chip, * and any additional configuration data. * Never attempt to use user-level input IDs (e.g. Composite, S-Video, * Tuner) at this level. An i2c device shouldn't know about whether an * input pin is connected to a Composite connector, become on another * board or platform it might be connected to something else entirely. * The calling driver is responsible for mapping a user-level input to * the right pins on the i2c device. * * @s_stream: used to notify the audio code that stream will start or has * stopped. */ struct v4l2_subdev_audio_ops { int (*s_clock_freq)(struct v4l2_subdev *sd, u32 freq); int (*s_i2s_clock_freq)(struct v4l2_subdev *sd, u32 freq); int (*s_routing)(struct v4l2_subdev *sd, u32 input, u32 output, u32 config); int (*s_stream)(struct v4l2_subdev *sd, int enable); }; /** * struct v4l2_mbus_frame_desc_entry_csi2 * * @vc: CSI-2 virtual channel * @dt: CSI-2 data type ID */ struct v4l2_mbus_frame_desc_entry_csi2 { u8 vc; u8 dt; }; /** * enum v4l2_mbus_frame_desc_flags - media bus frame description flags * * @V4L2_MBUS_FRAME_DESC_FL_LEN_MAX: * Indicates that &struct v4l2_mbus_frame_desc_entry->length field * specifies maximum data length. * @V4L2_MBUS_FRAME_DESC_FL_BLOB: * Indicates that the format does not have line offsets, i.e. * the receiver should use 1D DMA. */ enum v4l2_mbus_frame_desc_flags { V4L2_MBUS_FRAME_DESC_FL_LEN_MAX = BIT(0), V4L2_MBUS_FRAME_DESC_FL_BLOB = BIT(1), }; /** * struct v4l2_mbus_frame_desc_entry - media bus frame description structure * * @flags: bitmask flags, as defined by &enum v4l2_mbus_frame_desc_flags. * @stream: stream in routing configuration * @pixelcode: media bus pixel code, valid if @flags * %FRAME_DESC_FL_BLOB is not set. * @length: number of octets per frame, valid if @flags * %V4L2_MBUS_FRAME_DESC_FL_LEN_MAX is set. * @bus: Bus-specific frame descriptor parameters * @bus.csi2: CSI-2-specific bus configuration */ struct v4l2_mbus_frame_desc_entry { enum v4l2_mbus_frame_desc_flags flags; u32 stream; u32 pixelcode; u32 length; union { struct v4l2_mbus_frame_desc_entry_csi2 csi2; } bus; }; /* * If this number is too small, it should be dropped altogether and the * API switched to a dynamic number of frame descriptor entries. */ #define V4L2_FRAME_DESC_ENTRY_MAX 8 /** * enum v4l2_mbus_frame_desc_type - media bus frame description type * * @V4L2_MBUS_FRAME_DESC_TYPE_UNDEFINED: * Undefined frame desc type. Drivers should not use this, it is * for backwards compatibility. * @V4L2_MBUS_FRAME_DESC_TYPE_PARALLEL: * Parallel media bus. * @V4L2_MBUS_FRAME_DESC_TYPE_CSI2: * CSI-2 media bus. Frame desc parameters must be set in * &struct v4l2_mbus_frame_desc_entry->csi2. */ enum v4l2_mbus_frame_desc_type { V4L2_MBUS_FRAME_DESC_TYPE_UNDEFINED = 0, V4L2_MBUS_FRAME_DESC_TYPE_PARALLEL, V4L2_MBUS_FRAME_DESC_TYPE_CSI2, }; /** * struct v4l2_mbus_frame_desc - media bus data frame description * @type: type of the bus (enum v4l2_mbus_frame_desc_type) * @entry: frame descriptors array * @num_entries: number of entries in @entry array */ struct v4l2_mbus_frame_desc { enum v4l2_mbus_frame_desc_type type; struct v4l2_mbus_frame_desc_entry entry[V4L2_FRAME_DESC_ENTRY_MAX]; unsigned short num_entries; }; /** * enum v4l2_subdev_pre_streamon_flags - Flags for pre_streamon subdev core op * * @V4L2_SUBDEV_PRE_STREAMON_FL_MANUAL_LP: Set the transmitter to either LP-11 * or LP-111 mode before call to s_stream(). */ enum v4l2_subdev_pre_streamon_flags { V4L2_SUBDEV_PRE_STREAMON_FL_MANUAL_LP = BIT(0), }; /** * struct v4l2_subdev_video_ops - Callbacks used when v4l device was opened * in video mode. * * @s_routing: see s_routing in audio_ops, except this version is for video * devices. * * @s_crystal_freq: sets the frequency of the crystal used to generate the * clocks in Hz. An extra flags field allows device specific configuration * regarding clock frequency dividers, etc. If not used, then set flags * to 0. If the frequency is not supported, then -EINVAL is returned. * * @g_std: callback for VIDIOC_G_STD() ioctl handler code. * * @s_std: callback for VIDIOC_S_STD() ioctl handler code. * * @s_std_output: set v4l2_std_id for video OUTPUT devices. This is ignored by * video input devices. * * @g_std_output: get current standard for video OUTPUT devices. This is ignored * by video input devices. * * @querystd: callback for VIDIOC_QUERYSTD() ioctl handler code. * * @g_tvnorms: get &v4l2_std_id with all standards supported by the video * CAPTURE device. This is ignored by video output devices. * * @g_tvnorms_output: get v4l2_std_id with all standards supported by the video * OUTPUT device. This is ignored by video capture devices. * * @g_input_status: get input status. Same as the status field in the * &struct v4l2_input * * @s_stream: start (enabled == 1) or stop (enabled == 0) streaming on the * sub-device. Failure on stop will remove any resources acquired in * streaming start, while the error code is still returned by the driver. * The caller shall track the subdev state, and shall not start or stop an * already started or stopped subdev. Also see call_s_stream wrapper in * v4l2-subdev.c. * * @g_pixelaspect: callback to return the pixelaspect ratio. * * @s_dv_timings: Set custom dv timings in the sub device. This is used * when sub device is capable of setting detailed timing information * in the hardware to generate/detect the video signal. * * @g_dv_timings: Get custom dv timings in the sub device. * * @query_dv_timings: callback for VIDIOC_QUERY_DV_TIMINGS() ioctl handler code. * * @s_rx_buffer: set a host allocated memory buffer for the subdev. The subdev * can adjust @size to a lower value and must not write more data to the * buffer starting at @data than the original value of @size. * * @pre_streamon: May be called before streaming is actually started, to help * initialising the bus. Current usage is to set a CSI-2 transmitter to * LP-11 or LP-111 mode before streaming. See &enum * v4l2_subdev_pre_streamon_flags. * * pre_streamon shall return error if it cannot perform the operation as * indicated by the flags argument. In particular, -EACCES indicates lack * of support for the operation. The caller shall call post_streamoff for * each successful call of pre_streamon. * * @post_streamoff: Called after streaming is stopped, but if and only if * pre_streamon was called earlier. */ struct v4l2_subdev_video_ops { int (*s_routing)(struct v4l2_subdev *sd, u32 input, u32 output, u32 config); int (*s_crystal_freq)(struct v4l2_subdev *sd, u32 freq, u32 flags); int (*g_std)(struct v4l2_subdev *sd, v4l2_std_id *norm); int (*s_std)(struct v4l2_subdev *sd, v4l2_std_id norm); int (*s_std_output)(struct v4l2_subdev *sd, v4l2_std_id std); int (*g_std_output)(struct v4l2_subdev *sd, v4l2_std_id *std); int (*querystd)(struct v4l2_subdev *sd, v4l2_std_id *std); int (*g_tvnorms)(struct v4l2_subdev *sd, v4l2_std_id *std); int (*g_tvnorms_output)(struct v4l2_subdev *sd, v4l2_std_id *std); int (*g_input_status)(struct v4l2_subdev *sd, u32 *status); int (*s_stream)(struct v4l2_subdev *sd, int enable); int (*g_pixelaspect)(struct v4l2_subdev *sd, struct v4l2_fract *aspect); int (*s_dv_timings)(struct v4l2_subdev *sd, struct v4l2_dv_timings *timings); int (*g_dv_timings)(struct v4l2_subdev *sd, struct v4l2_dv_timings *timings); int (*query_dv_timings)(struct v4l2_subdev *sd, struct v4l2_dv_timings *timings); int (*s_rx_buffer)(struct v4l2_subdev *sd, void *buf, unsigned int *size); int (*pre_streamon)(struct v4l2_subdev *sd, u32 flags); int (*post_streamoff)(struct v4l2_subdev *sd); }; /** * struct v4l2_subdev_vbi_ops - Callbacks used when v4l device was opened * in video mode via the vbi device node. * * @decode_vbi_line: video decoders that support sliced VBI need to implement * this ioctl. Field p of the &struct v4l2_decode_vbi_line is set to the * start of the VBI data that was generated by the decoder. The driver * then parses the sliced VBI data and sets the other fields in the * struct accordingly. The pointer p is updated to point to the start of * the payload which can be copied verbatim into the data field of the * &struct v4l2_sliced_vbi_data. If no valid VBI data was found, then the * type field is set to 0 on return. * * @s_vbi_data: used to generate VBI signals on a video signal. * &struct v4l2_sliced_vbi_data is filled with the data packets that * should be output. Note that if you set the line field to 0, then that * VBI signal is disabled. If no valid VBI data was found, then the type * field is set to 0 on return. * * @g_vbi_data: used to obtain the sliced VBI packet from a readback register. * Not all video decoders support this. If no data is available because * the readback register contains invalid or erroneous data %-EIO is * returned. Note that you must fill in the 'id' member and the 'field' * member (to determine whether CC data from the first or second field * should be obtained). * * @g_sliced_vbi_cap: callback for VIDIOC_G_SLICED_VBI_CAP() ioctl handler * code. * * @s_raw_fmt: setup the video encoder/decoder for raw VBI. * * @g_sliced_fmt: retrieve the current sliced VBI settings. * * @s_sliced_fmt: setup the sliced VBI settings. */ struct v4l2_subdev_vbi_ops { int (*decode_vbi_line)(struct v4l2_subdev *sd, struct v4l2_decode_vbi_line *vbi_line); int (*s_vbi_data)(struct v4l2_subdev *sd, const struct v4l2_sliced_vbi_data *vbi_data); int (*g_vbi_data)(struct v4l2_subdev *sd, struct v4l2_sliced_vbi_data *vbi_data); int (*g_sliced_vbi_cap)(struct v4l2_subdev *sd, struct v4l2_sliced_vbi_cap *cap); int (*s_raw_fmt)(struct v4l2_subdev *sd, struct v4l2_vbi_format *fmt); int (*g_sliced_fmt)(struct v4l2_subdev *sd, struct v4l2_sliced_vbi_format *fmt); int (*s_sliced_fmt)(struct v4l2_subdev *sd, struct v4l2_sliced_vbi_format *fmt); }; /** * struct v4l2_subdev_sensor_ops - v4l2-subdev sensor operations * @g_skip_top_lines: number of lines at the top of the image to be skipped. * This is needed for some sensors, which always corrupt * several top lines of the output image, or which send their * metadata in them. * @g_skip_frames: number of frames to skip at stream start. This is needed for * buggy sensors that generate faulty frames when they are * turned on. */ struct v4l2_subdev_sensor_ops { int (*g_skip_top_lines)(struct v4l2_subdev *sd, u32 *lines); int (*g_skip_frames)(struct v4l2_subdev *sd, u32 *frames); }; /** * enum v4l2_subdev_ir_mode- describes the type of IR supported * * @V4L2_SUBDEV_IR_MODE_PULSE_WIDTH: IR uses struct ir_raw_event records */ enum v4l2_subdev_ir_mode { V4L2_SUBDEV_IR_MODE_PULSE_WIDTH, }; /** * struct v4l2_subdev_ir_parameters - Parameters for IR TX or TX * * @bytes_per_data_element: bytes per data element of data in read or * write call. * @mode: IR mode as defined by &enum v4l2_subdev_ir_mode. * @enable: device is active if true * @interrupt_enable: IR interrupts are enabled if true * @shutdown: if true: set hardware to low/no power, false: normal mode * * @modulation: if true, it uses carrier, if false: baseband * @max_pulse_width: maximum pulse width in ns, valid only for baseband signal * @carrier_freq: carrier frequency in Hz, valid only for modulated signal * @duty_cycle: duty cycle percentage, valid only for modulated signal * @invert_level: invert signal level * * @invert_carrier_sense: Send 0/space as a carrier burst. used only in TX. * * @noise_filter_min_width: min time of a valid pulse, in ns. Used only for RX. * @carrier_range_lower: Lower carrier range, in Hz, valid only for modulated * signal. Used only for RX. * @carrier_range_upper: Upper carrier range, in Hz, valid only for modulated * signal. Used only for RX. * @resolution: The receive resolution, in ns . Used only for RX. */ struct v4l2_subdev_ir_parameters { unsigned int bytes_per_data_element; enum v4l2_subdev_ir_mode mode; bool enable; bool interrupt_enable; bool shutdown; bool modulation; u32 max_pulse_width; unsigned int carrier_freq; unsigned int duty_cycle; bool invert_level; /* Tx only */ bool invert_carrier_sense; /* Rx only */ u32 noise_filter_min_width; unsigned int carrier_range_lower; unsigned int carrier_range_upper; u32 resolution; }; /** * struct v4l2_subdev_ir_ops - operations for IR subdevices * * @rx_read: Reads received codes or pulse width data. * The semantics are similar to a non-blocking read() call. * @rx_g_parameters: Get the current operating parameters and state of * the IR receiver. * @rx_s_parameters: Set the current operating parameters and state of * the IR receiver. It is recommended to call * [rt]x_g_parameters first to fill out the current state, and only change * the fields that need to be changed. Upon return, the actual device * operating parameters and state will be returned. Note that hardware * limitations may prevent the actual settings from matching the requested * settings - e.g. an actual carrier setting of 35,904 Hz when 36,000 Hz * was requested. An exception is when the shutdown parameter is true. * The last used operational parameters will be returned, but the actual * state of the hardware be different to minimize power consumption and * processing when shutdown is true. * * @tx_write: Writes codes or pulse width data for transmission. * The semantics are similar to a non-blocking write() call. * @tx_g_parameters: Get the current operating parameters and state of * the IR transmitter. * @tx_s_parameters: Set the current operating parameters and state of * the IR transmitter. It is recommended to call * [rt]x_g_parameters first to fill out the current state, and only change * the fields that need to be changed. Upon return, the actual device * operating parameters and state will be returned. Note that hardware * limitations may prevent the actual settings from matching the requested * settings - e.g. an actual carrier setting of 35,904 Hz when 36,000 Hz * was requested. An exception is when the shutdown parameter is true. * The last used operational parameters will be returned, but the actual * state of the hardware be different to minimize power consumption and * processing when shutdown is true. */ struct v4l2_subdev_ir_ops { /* Receiver */ int (*rx_read)(struct v4l2_subdev *sd, u8 *buf, size_t count, ssize_t *num); int (*rx_g_parameters)(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *params); int (*rx_s_parameters)(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *params); /* Transmitter */ int (*tx_write)(struct v4l2_subdev *sd, u8 *buf, size_t count, ssize_t *num); int (*tx_g_parameters)(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *params); int (*tx_s_parameters)(struct v4l2_subdev *sd, struct v4l2_subdev_ir_parameters *params); }; /** * struct v4l2_subdev_pad_config - Used for storing subdev pad information. * * @format: &struct v4l2_mbus_framefmt * @crop: &struct v4l2_rect to be used for crop * @compose: &struct v4l2_rect to be used for compose * @interval: frame interval */ struct v4l2_subdev_pad_config { struct v4l2_mbus_framefmt format; struct v4l2_rect crop; struct v4l2_rect compose; struct v4l2_fract interval; }; /** * struct v4l2_subdev_stream_config - Used for storing stream configuration. * * @pad: pad number * @stream: stream number * @enabled: has the stream been enabled with v4l2_subdev_enable_stream() * @fmt: &struct v4l2_mbus_framefmt * @crop: &struct v4l2_rect to be used for crop * @compose: &struct v4l2_rect to be used for compose * @interval: frame interval * * This structure stores configuration for a stream. */ struct v4l2_subdev_stream_config { u32 pad; u32 stream; bool enabled; struct v4l2_mbus_framefmt fmt; struct v4l2_rect crop; struct v4l2_rect compose; struct v4l2_fract interval; }; /** * struct v4l2_subdev_stream_configs - A collection of stream configs. * * @num_configs: number of entries in @config. * @configs: an array of &struct v4l2_subdev_stream_configs. */ struct v4l2_subdev_stream_configs { u32 num_configs; struct v4l2_subdev_stream_config *configs; }; /** * struct v4l2_subdev_krouting - subdev routing table * * @num_routes: number of routes * @routes: &struct v4l2_subdev_route * * This structure contains the routing table for a subdev. */ struct v4l2_subdev_krouting { unsigned int num_routes; struct v4l2_subdev_route *routes; }; /** * struct v4l2_subdev_state - Used for storing subdev state information. * * @_lock: default for 'lock' * @lock: mutex for the state. May be replaced by the user. * @sd: the sub-device which the state is related to * @pads: &struct v4l2_subdev_pad_config array * @routing: routing table for the subdev * @stream_configs: stream configurations (only for V4L2_SUBDEV_FL_STREAMS) * * This structure only needs to be passed to the pad op if the 'which' field * of the main argument is set to %V4L2_SUBDEV_FORMAT_TRY. For * %V4L2_SUBDEV_FORMAT_ACTIVE it is safe to pass %NULL. */ struct v4l2_subdev_state { /* lock for the struct v4l2_subdev_state fields */ struct mutex _lock; struct mutex *lock; struct v4l2_subdev *sd; struct v4l2_subdev_pad_config *pads; struct v4l2_subdev_krouting routing; struct v4l2_subdev_stream_configs stream_configs; }; /** * struct v4l2_subdev_pad_ops - v4l2-subdev pad level operations * * @enum_mbus_code: callback for VIDIOC_SUBDEV_ENUM_MBUS_CODE() ioctl handler * code. * @enum_frame_size: callback for VIDIOC_SUBDEV_ENUM_FRAME_SIZE() ioctl handler * code. * * @enum_frame_interval: callback for VIDIOC_SUBDEV_ENUM_FRAME_INTERVAL() ioctl * handler code. * * @get_fmt: callback for VIDIOC_SUBDEV_G_FMT() ioctl handler code. * * @set_fmt: callback for VIDIOC_SUBDEV_S_FMT() ioctl handler code. * * @get_selection: callback for VIDIOC_SUBDEV_G_SELECTION() ioctl handler code. * * @set_selection: callback for VIDIOC_SUBDEV_S_SELECTION() ioctl handler code. * * @get_frame_interval: callback for VIDIOC_SUBDEV_G_FRAME_INTERVAL() * ioctl handler code. * * @set_frame_interval: callback for VIDIOC_SUBDEV_S_FRAME_INTERVAL() * ioctl handler code. * * @get_edid: callback for VIDIOC_SUBDEV_G_EDID() ioctl handler code. * * @set_edid: callback for VIDIOC_SUBDEV_S_EDID() ioctl handler code. * * @dv_timings_cap: callback for VIDIOC_SUBDEV_DV_TIMINGS_CAP() ioctl handler * code. * * @enum_dv_timings: callback for VIDIOC_SUBDEV_ENUM_DV_TIMINGS() ioctl handler * code. * * @link_validate: used by the media controller code to check if the links * that belongs to a pipeline can be used for stream. * * @get_frame_desc: get the current low level media bus frame parameters. * * @set_frame_desc: set the low level media bus frame parameters, @fd array * may be adjusted by the subdev driver to device capabilities. * * @get_mbus_config: get the media bus configuration of a remote sub-device. * The media bus configuration is usually retrieved from the * firmware interface at sub-device probe time, immediately * applied to the hardware and eventually adjusted by the * driver. Remote sub-devices (usually video receivers) shall * use this operation to query the transmitting end bus * configuration in order to adjust their own one accordingly. * Callers should make sure they get the most up-to-date as * possible configuration from the remote end, likely calling * this operation as close as possible to stream on time. The * operation shall fail if the pad index it has been called on * is not valid or in case of unrecoverable failures. * * @set_routing: Enable or disable data connection routes described in the * subdevice routing table. Subdevs that implement this operation * must set the V4L2_SUBDEV_FL_STREAMS flag. * * @enable_streams: Enable the streams defined in streams_mask on the given * source pad. Subdevs that implement this operation must use the active * state management provided by the subdev core (enabled through a call to * v4l2_subdev_init_finalize() at initialization time). Do not call * directly, use v4l2_subdev_enable_streams() instead. * * @disable_streams: Disable the streams defined in streams_mask on the given * source pad. Subdevs that implement this operation must use the active * state management provided by the subdev core (enabled through a call to * v4l2_subdev_init_finalize() at initialization time). Do not call * directly, use v4l2_subdev_disable_streams() instead. */ struct v4l2_subdev_pad_ops { int (*enum_mbus_code)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_mbus_code_enum *code); int (*enum_frame_size)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_size_enum *fse); int (*enum_frame_interval)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval_enum *fie); int (*get_fmt)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format); int (*set_fmt)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format); int (*get_selection)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel); int (*set_selection)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel); int (*get_frame_interval)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *interval); int (*set_frame_interval)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *interval); int (*get_edid)(struct v4l2_subdev *sd, struct v4l2_edid *edid); int (*set_edid)(struct v4l2_subdev *sd, struct v4l2_edid *edid); int (*dv_timings_cap)(struct v4l2_subdev *sd, struct v4l2_dv_timings_cap *cap); int (*enum_dv_timings)(struct v4l2_subdev *sd, struct v4l2_enum_dv_timings *timings); #ifdef CONFIG_MEDIA_CONTROLLER int (*link_validate)(struct v4l2_subdev *sd, struct media_link *link, struct v4l2_subdev_format *source_fmt, struct v4l2_subdev_format *sink_fmt); #endif /* CONFIG_MEDIA_CONTROLLER */ int (*get_frame_desc)(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_mbus_frame_desc *fd); int (*set_frame_desc)(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_mbus_frame_desc *fd); int (*get_mbus_config)(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_mbus_config *config); int (*set_routing)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, enum v4l2_subdev_format_whence which, struct v4l2_subdev_krouting *route); int (*enable_streams)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 pad, u64 streams_mask); int (*disable_streams)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 pad, u64 streams_mask); }; /** * struct v4l2_subdev_ops - Subdev operations * * @core: pointer to &struct v4l2_subdev_core_ops. Can be %NULL * @tuner: pointer to &struct v4l2_subdev_tuner_ops. Can be %NULL * @audio: pointer to &struct v4l2_subdev_audio_ops. Can be %NULL * @video: pointer to &struct v4l2_subdev_video_ops. Can be %NULL * @vbi: pointer to &struct v4l2_subdev_vbi_ops. Can be %NULL * @ir: pointer to &struct v4l2_subdev_ir_ops. Can be %NULL * @sensor: pointer to &struct v4l2_subdev_sensor_ops. Can be %NULL * @pad: pointer to &struct v4l2_subdev_pad_ops. Can be %NULL */ struct v4l2_subdev_ops { const struct v4l2_subdev_core_ops *core; const struct v4l2_subdev_tuner_ops *tuner; const struct v4l2_subdev_audio_ops *audio; const struct v4l2_subdev_video_ops *video; const struct v4l2_subdev_vbi_ops *vbi; const struct v4l2_subdev_ir_ops *ir; const struct v4l2_subdev_sensor_ops *sensor; const struct v4l2_subdev_pad_ops *pad; }; /** * struct v4l2_subdev_internal_ops - V4L2 subdev internal ops * * @init_state: initialize the subdev state to default values * * @registered: called when this subdev is registered. When called the v4l2_dev * field is set to the correct v4l2_device. * * @unregistered: called when this subdev is unregistered. When called the * v4l2_dev field is still set to the correct v4l2_device. * * @open: called when the subdev device node is opened by an application. * * @close: called when the subdev device node is closed. Please note that * it is possible for @close to be called after @unregistered! * * @release: called when the last user of the subdev device is gone. This * happens after the @unregistered callback and when the last open * filehandle to the v4l-subdevX device node was closed. If no device * node was created for this sub-device, then the @release callback * is called right after the @unregistered callback. * The @release callback is typically used to free the memory containing * the v4l2_subdev structure. It is almost certainly required for any * sub-device that sets the V4L2_SUBDEV_FL_HAS_DEVNODE flag. * * .. note:: * Never call this from drivers, only the v4l2 framework can call * these ops. */ struct v4l2_subdev_internal_ops { int (*init_state)(struct v4l2_subdev *sd, struct v4l2_subdev_state *state); int (*registered)(struct v4l2_subdev *sd); void (*unregistered)(struct v4l2_subdev *sd); int (*open)(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh); int (*close)(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh); void (*release)(struct v4l2_subdev *sd); }; /* Set this flag if this subdev is a i2c device. */ #define V4L2_SUBDEV_FL_IS_I2C (1U << 0) /* Set this flag if this subdev is a spi device. */ #define V4L2_SUBDEV_FL_IS_SPI (1U << 1) /* Set this flag if this subdev needs a device node. */ #define V4L2_SUBDEV_FL_HAS_DEVNODE (1U << 2) /* * Set this flag if this subdev generates events. * Note controls can send events, thus drivers exposing controls * should set this flag. */ #define V4L2_SUBDEV_FL_HAS_EVENTS (1U << 3) /* * Set this flag if this subdev supports multiplexed streams. This means * that the driver supports routing and handles the stream parameter in its * v4l2_subdev_pad_ops handlers. More specifically, this means: * * - Centrally managed subdev active state is enabled * - Legacy pad config is _not_ supported (state->pads is NULL) * - Routing ioctls are available * - Multiple streams per pad are supported */ #define V4L2_SUBDEV_FL_STREAMS (1U << 4) struct regulator_bulk_data; /** * struct v4l2_subdev_platform_data - regulators config struct * * @regulators: Optional regulators used to power on/off the subdevice * @num_regulators: Number of regululators * @host_priv: Per-subdevice data, specific for a certain video host device */ struct v4l2_subdev_platform_data { struct regulator_bulk_data *regulators; int num_regulators; void *host_priv; }; /** * struct v4l2_subdev - describes a V4L2 sub-device * * @entity: pointer to &struct media_entity * @list: List of sub-devices * @owner: The owner is the same as the driver's &struct device owner. * @owner_v4l2_dev: true if the &sd->owner matches the owner of @v4l2_dev->dev * owner. Initialized by v4l2_device_register_subdev(). * @flags: subdev flags. Can be: * %V4L2_SUBDEV_FL_IS_I2C - Set this flag if this subdev is a i2c device; * %V4L2_SUBDEV_FL_IS_SPI - Set this flag if this subdev is a spi device; * %V4L2_SUBDEV_FL_HAS_DEVNODE - Set this flag if this subdev needs a * device node; * %V4L2_SUBDEV_FL_HAS_EVENTS - Set this flag if this subdev generates * events. * * @v4l2_dev: pointer to struct &v4l2_device * @ops: pointer to struct &v4l2_subdev_ops * @internal_ops: pointer to struct &v4l2_subdev_internal_ops. * Never call these internal ops from within a driver! * @ctrl_handler: The control handler of this subdev. May be NULL. * @name: Name of the sub-device. Please notice that the name must be unique. * @grp_id: can be used to group similar subdevs. Value is driver-specific * @dev_priv: pointer to private data * @host_priv: pointer to private data used by the device where the subdev * is attached. * @devnode: subdev device node * @dev: pointer to the physical device, if any * @fwnode: The fwnode_handle of the subdev, usually the same as * either dev->of_node->fwnode or dev->fwnode (whichever is non-NULL). * @async_list: Links this subdev to a global subdev_list or * @notifier->done_list list. * @async_subdev_endpoint_list: List entry in async_subdev_endpoint_entry of * &struct v4l2_async_subdev_endpoint. * @subdev_notifier: A sub-device notifier implicitly registered for the sub- * device using v4l2_async_register_subdev_sensor(). * @asc_list: Async connection list, of &struct * v4l2_async_connection.subdev_entry. * @pdata: common part of subdevice platform data * @state_lock: A pointer to a lock used for all the subdev's states, set by the * driver. This is optional. If NULL, each state instance will get * a lock of its own. * @privacy_led: Optional pointer to a LED classdev for the privacy LED for sensors. * @active_state: Active state for the subdev (NULL for subdevs tracking the * state internally). Initialized by calling * v4l2_subdev_init_finalize(). * @enabled_streams: Bitmask of enabled streams used by * v4l2_subdev_enable_streams() and * v4l2_subdev_disable_streams() helper functions for fallback * cases. * * Each instance of a subdev driver should create this struct, either * stand-alone or embedded in a larger struct. * * This structure should be initialized by v4l2_subdev_init() or one of * its variants: v4l2_spi_subdev_init(), v4l2_i2c_subdev_init(). */ struct v4l2_subdev { #if defined(CONFIG_MEDIA_CONTROLLER) struct media_entity entity; #endif struct list_head list; struct module *owner; bool owner_v4l2_dev; u32 flags; struct v4l2_device *v4l2_dev; const struct v4l2_subdev_ops *ops; const struct v4l2_subdev_internal_ops *internal_ops; struct v4l2_ctrl_handler *ctrl_handler; char name[52]; u32 grp_id; void *dev_priv; void *host_priv; struct video_device *devnode; struct device *dev; struct fwnode_handle *fwnode; struct list_head async_list; struct list_head async_subdev_endpoint_list; struct v4l2_async_notifier *subdev_notifier; struct list_head asc_list; struct v4l2_subdev_platform_data *pdata; struct mutex *state_lock; /* * The fields below are private, and should only be accessed via * appropriate functions. */ struct led_classdev *privacy_led; /* * TODO: active_state should most likely be changed from a pointer to an * embedded field. For the time being it's kept as a pointer to more * easily catch uses of active_state in the cases where the driver * doesn't support it. */ struct v4l2_subdev_state *active_state; u64 enabled_streams; }; /** * media_entity_to_v4l2_subdev - Returns a &struct v4l2_subdev from * the &struct media_entity embedded in it. * * @ent: pointer to &struct media_entity. */ #define media_entity_to_v4l2_subdev(ent) \ ({ \ typeof(ent) __me_sd_ent = (ent); \ \ __me_sd_ent ? \ container_of(__me_sd_ent, struct v4l2_subdev, entity) : \ NULL; \ }) /** * vdev_to_v4l2_subdev - Returns a &struct v4l2_subdev from * the &struct video_device embedded on it. * * @vdev: pointer to &struct video_device */ #define vdev_to_v4l2_subdev(vdev) \ ((struct v4l2_subdev *)video_get_drvdata(vdev)) /** * struct v4l2_subdev_fh - Used for storing subdev information per file handle * * @vfh: pointer to &struct v4l2_fh * @state: pointer to &struct v4l2_subdev_state * @owner: module pointer to the owner of this file handle * @client_caps: bitmask of ``V4L2_SUBDEV_CLIENT_CAP_*`` */ struct v4l2_subdev_fh { struct v4l2_fh vfh; struct module *owner; #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) struct v4l2_subdev_state *state; u64 client_caps; #endif }; /** * to_v4l2_subdev_fh - Returns a &struct v4l2_subdev_fh from * the &struct v4l2_fh embedded on it. * * @fh: pointer to &struct v4l2_fh */ #define to_v4l2_subdev_fh(fh) \ container_of(fh, struct v4l2_subdev_fh, vfh) extern const struct v4l2_file_operations v4l2_subdev_fops; /** * v4l2_set_subdevdata - Sets V4L2 dev private device data * * @sd: pointer to &struct v4l2_subdev * @p: pointer to the private device data to be stored. */ static inline void v4l2_set_subdevdata(struct v4l2_subdev *sd, void *p) { sd->dev_priv = p; } /** * v4l2_get_subdevdata - Gets V4L2 dev private device data * * @sd: pointer to &struct v4l2_subdev * * Returns the pointer to the private device data to be stored. */ static inline void *v4l2_get_subdevdata(const struct v4l2_subdev *sd) { return sd->dev_priv; } /** * v4l2_set_subdev_hostdata - Sets V4L2 dev private host data * * @sd: pointer to &struct v4l2_subdev * @p: pointer to the private data to be stored. */ static inline void v4l2_set_subdev_hostdata(struct v4l2_subdev *sd, void *p) { sd->host_priv = p; } /** * v4l2_get_subdev_hostdata - Gets V4L2 dev private data * * @sd: pointer to &struct v4l2_subdev * * Returns the pointer to the private host data to be stored. */ static inline void *v4l2_get_subdev_hostdata(const struct v4l2_subdev *sd) { return sd->host_priv; } #ifdef CONFIG_MEDIA_CONTROLLER /** * v4l2_subdev_get_fwnode_pad_1_to_1 - Get pad number from a subdev fwnode * endpoint, assuming 1:1 port:pad * * @entity: Pointer to the subdev entity * @endpoint: Pointer to a parsed fwnode endpoint * * This function can be used as the .get_fwnode_pad operation for * subdevices that map port numbers and pad indexes 1:1. If the endpoint * is owned by the subdevice, the function returns the endpoint port * number. * * Returns the endpoint port number on success or a negative error code. */ int v4l2_subdev_get_fwnode_pad_1_to_1(struct media_entity *entity, struct fwnode_endpoint *endpoint); /** * v4l2_subdev_link_validate_default - validates a media link * * @sd: pointer to &struct v4l2_subdev * @link: pointer to &struct media_link * @source_fmt: pointer to &struct v4l2_subdev_format * @sink_fmt: pointer to &struct v4l2_subdev_format * * This function ensures that width, height and the media bus pixel * code are equal on both source and sink of the link. */ int v4l2_subdev_link_validate_default(struct v4l2_subdev *sd, struct media_link *link, struct v4l2_subdev_format *source_fmt, struct v4l2_subdev_format *sink_fmt); /** * v4l2_subdev_link_validate - validates a media link * * @link: pointer to &struct media_link * * This function calls the subdev's link_validate ops to validate * if a media link is valid for streaming. It also internally * calls v4l2_subdev_link_validate_default() to ensure that * width, height and the media bus pixel code are equal on both * source and sink of the link. */ int v4l2_subdev_link_validate(struct media_link *link); /** * v4l2_subdev_has_pad_interdep - MC has_pad_interdep implementation for subdevs * * @entity: pointer to &struct media_entity * @pad0: pad number for the first pad * @pad1: pad number for the second pad * * This function is an implementation of the * media_entity_operations.has_pad_interdep operation for subdevs that * implement the multiplexed streams API (as indicated by the * V4L2_SUBDEV_FL_STREAMS subdev flag). * * It considers two pads interdependent if there is an active route between pad0 * and pad1. */ bool v4l2_subdev_has_pad_interdep(struct media_entity *entity, unsigned int pad0, unsigned int pad1); /** * __v4l2_subdev_state_alloc - allocate v4l2_subdev_state * * @sd: pointer to &struct v4l2_subdev for which the state is being allocated. * @lock_name: name of the state lock * @key: lock_class_key for the lock * * Must call __v4l2_subdev_state_free() when state is no longer needed. * * Not to be called directly by the drivers. */ struct v4l2_subdev_state *__v4l2_subdev_state_alloc(struct v4l2_subdev *sd, const char *lock_name, struct lock_class_key *key); /** * __v4l2_subdev_state_free - free a v4l2_subdev_state * * @state: v4l2_subdev_state to be freed. * * Not to be called directly by the drivers. */ void __v4l2_subdev_state_free(struct v4l2_subdev_state *state); /** * v4l2_subdev_init_finalize() - Finalizes the initialization of the subdevice * @sd: The subdev * * This function finalizes the initialization of the subdev, including * allocation of the active state for the subdev. * * This function must be called by the subdev drivers that use the centralized * active state, after the subdev struct has been initialized and * media_entity_pads_init() has been called, but before registering the * subdev. * * The user must call v4l2_subdev_cleanup() when the subdev is being removed. */ #define v4l2_subdev_init_finalize(sd) \ ({ \ static struct lock_class_key __key; \ const char *name = KBUILD_BASENAME \ ":" __stringify(__LINE__) ":sd->active_state->lock"; \ __v4l2_subdev_init_finalize(sd, name, &__key); \ }) int __v4l2_subdev_init_finalize(struct v4l2_subdev *sd, const char *name, struct lock_class_key *key); /** * v4l2_subdev_cleanup() - Releases the resources allocated by the subdevice * @sd: The subdevice * * Clean up a V4L2 async sub-device. Must be called for a sub-device as part of * its release if resources have been associated with it using * v4l2_async_subdev_endpoint_add() or v4l2_subdev_init_finalize(). */ void v4l2_subdev_cleanup(struct v4l2_subdev *sd); /* * A macro to generate the macro or function name for sub-devices state access * wrapper macros below. */ #define __v4l2_subdev_state_gen_call(NAME, _1, ARG, ...) \ __v4l2_subdev_state_get_ ## NAME ## ARG /** * v4l2_subdev_state_get_format() - Get pointer to a stream format * @state: subdevice state * @pad: pad id * @...: stream id (optional argument) * * This returns a pointer to &struct v4l2_mbus_framefmt for the given pad + * stream in the subdev state. * * For stream-unaware drivers the format for the corresponding pad is returned. * If the pad does not exist, NULL is returned. */ /* * Wrap v4l2_subdev_state_get_format(), allowing the function to be called with * two or three arguments. The purpose of the __v4l2_subdev_state_get_format() * macro below is to come up with the name of the function or macro to call, * using the last two arguments (_stream and _pad). The selected function or * macro is then called using the arguments specified by the caller. A similar * arrangement is used for v4l2_subdev_state_crop() and * v4l2_subdev_state_compose() below. */ #define v4l2_subdev_state_get_format(state, pad, ...) \ __v4l2_subdev_state_gen_call(format, ##__VA_ARGS__, , _pad) \ (state, pad, ##__VA_ARGS__) #define __v4l2_subdev_state_get_format_pad(state, pad) \ __v4l2_subdev_state_get_format(state, pad, 0) struct v4l2_mbus_framefmt * __v4l2_subdev_state_get_format(struct v4l2_subdev_state *state, unsigned int pad, u32 stream); /** * v4l2_subdev_state_get_crop() - Get pointer to a stream crop rectangle * @state: subdevice state * @pad: pad id * @...: stream id (optional argument) * * This returns a pointer to crop rectangle for the given pad + stream in the * subdev state. * * For stream-unaware drivers the crop rectangle for the corresponding pad is * returned. If the pad does not exist, NULL is returned. */ #define v4l2_subdev_state_get_crop(state, pad, ...) \ __v4l2_subdev_state_gen_call(crop, ##__VA_ARGS__, , _pad) \ (state, pad, ##__VA_ARGS__) #define __v4l2_subdev_state_get_crop_pad(state, pad) \ __v4l2_subdev_state_get_crop(state, pad, 0) struct v4l2_rect * __v4l2_subdev_state_get_crop(struct v4l2_subdev_state *state, unsigned int pad, u32 stream); /** * v4l2_subdev_state_get_compose() - Get pointer to a stream compose rectangle * @state: subdevice state * @pad: pad id * @...: stream id (optional argument) * * This returns a pointer to compose rectangle for the given pad + stream in the * subdev state. * * For stream-unaware drivers the compose rectangle for the corresponding pad is * returned. If the pad does not exist, NULL is returned. */ #define v4l2_subdev_state_get_compose(state, pad, ...) \ __v4l2_subdev_state_gen_call(compose, ##__VA_ARGS__, , _pad) \ (state, pad, ##__VA_ARGS__) #define __v4l2_subdev_state_get_compose_pad(state, pad) \ __v4l2_subdev_state_get_compose(state, pad, 0) struct v4l2_rect * __v4l2_subdev_state_get_compose(struct v4l2_subdev_state *state, unsigned int pad, u32 stream); /** * v4l2_subdev_state_get_interval() - Get pointer to a stream frame interval * @state: subdevice state * @pad: pad id * @...: stream id (optional argument) * * This returns a pointer to the frame interval for the given pad + stream in * the subdev state. * * For stream-unaware drivers the frame interval for the corresponding pad is * returned. If the pad does not exist, NULL is returned. */ #define v4l2_subdev_state_get_interval(state, pad, ...) \ __v4l2_subdev_state_gen_call(interval, ##__VA_ARGS__, , _pad) \ (state, pad, ##__VA_ARGS__) #define __v4l2_subdev_state_get_interval_pad(state, pad) \ __v4l2_subdev_state_get_interval(state, pad, 0) struct v4l2_fract * __v4l2_subdev_state_get_interval(struct v4l2_subdev_state *state, unsigned int pad, u32 stream); #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) /** * v4l2_subdev_get_fmt() - Fill format based on state * @sd: subdevice * @state: subdevice state * @format: pointer to &struct v4l2_subdev_format * * Fill @format->format field based on the information in the @format struct. * * This function can be used by the subdev drivers which support active state to * implement v4l2_subdev_pad_ops.get_fmt if the subdev driver does not need to * do anything special in their get_fmt op. * * Returns 0 on success, error value otherwise. */ int v4l2_subdev_get_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format); /** * v4l2_subdev_get_frame_interval() - Fill frame interval based on state * @sd: subdevice * @state: subdevice state * @fi: pointer to &struct v4l2_subdev_frame_interval * * Fill @fi->interval field based on the information in the @fi struct. * * This function can be used by the subdev drivers which support active state to * implement v4l2_subdev_pad_ops.get_frame_interval if the subdev driver does * not need to do anything special in their get_frame_interval op. * * Returns 0 on success, error value otherwise. */ int v4l2_subdev_get_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi); /** * v4l2_subdev_set_routing() - Set given routing to subdev state * @sd: The subdevice * @state: The subdevice state * @routing: Routing that will be copied to subdev state * * This will release old routing table (if any) from the state, allocate * enough space for the given routing, and copy the routing. * * This can be used from the subdev driver's set_routing op, after validating * the routing. */ int v4l2_subdev_set_routing(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, const struct v4l2_subdev_krouting *routing); struct v4l2_subdev_route * __v4l2_subdev_next_active_route(const struct v4l2_subdev_krouting *routing, struct v4l2_subdev_route *route); /** * for_each_active_route - iterate on all active routes of a routing table * @routing: The routing table * @route: The route iterator */ #define for_each_active_route(routing, route) \ for ((route) = NULL; \ ((route) = __v4l2_subdev_next_active_route((routing), (route)));) /** * v4l2_subdev_set_routing_with_fmt() - Set given routing and format to subdev * state * @sd: The subdevice * @state: The subdevice state * @routing: Routing that will be copied to subdev state * @fmt: Format used to initialize all the streams * * This is the same as v4l2_subdev_set_routing, but additionally initializes * all the streams using the given format. */ int v4l2_subdev_set_routing_with_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, const struct v4l2_subdev_krouting *routing, const struct v4l2_mbus_framefmt *fmt); /** * v4l2_subdev_routing_find_opposite_end() - Find the opposite stream * @routing: routing used to find the opposite side * @pad: pad id * @stream: stream id * @other_pad: pointer used to return the opposite pad * @other_stream: pointer used to return the opposite stream * * This function uses the routing table to find the pad + stream which is * opposite the given pad + stream. * * @other_pad and/or @other_stream can be NULL if the caller does not need the * value. * * Returns 0 on success, or -EINVAL if no matching route is found. */ int v4l2_subdev_routing_find_opposite_end(const struct v4l2_subdev_krouting *routing, u32 pad, u32 stream, u32 *other_pad, u32 *other_stream); /** * v4l2_subdev_state_get_opposite_stream_format() - Get pointer to opposite * stream format * @state: subdevice state * @pad: pad id * @stream: stream id * * This returns a pointer to &struct v4l2_mbus_framefmt for the pad + stream * that is opposite the given pad + stream in the subdev state. * * If the state does not contain the given pad + stream, NULL is returned. */ struct v4l2_mbus_framefmt * v4l2_subdev_state_get_opposite_stream_format(struct v4l2_subdev_state *state, u32 pad, u32 stream); /** * v4l2_subdev_state_xlate_streams() - Translate streams from one pad to another * * @state: Subdevice state * @pad0: The first pad * @pad1: The second pad * @streams: Streams bitmask on the first pad * * Streams on sink pads of a subdev are routed to source pads as expressed in * the subdev state routing table. Stream numbers don't necessarily match on * the sink and source side of a route. This function translates stream numbers * on @pad0, expressed as a bitmask in @streams, to the corresponding streams * on @pad1 using the routing table from the @state. It returns the stream mask * on @pad1, and updates @streams with the streams that have been found in the * routing table. * * @pad0 and @pad1 must be a sink and a source, in any order. * * Return: The bitmask of streams of @pad1 that are routed to @streams on @pad0. */ u64 v4l2_subdev_state_xlate_streams(const struct v4l2_subdev_state *state, u32 pad0, u32 pad1, u64 *streams); /** * enum v4l2_subdev_routing_restriction - Subdevice internal routing restrictions * * @V4L2_SUBDEV_ROUTING_NO_1_TO_N: * an input stream shall not be routed to multiple output streams (stream * duplication) * @V4L2_SUBDEV_ROUTING_NO_N_TO_1: * multiple input streams shall not be routed to the same output stream * (stream merging) * @V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX: * all streams from a sink pad must be routed to a single source pad * @V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX: * all streams on a source pad must originate from a single sink pad * @V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING: * source pads shall not contain multiplexed streams * @V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING: * sink pads shall not contain multiplexed streams * @V4L2_SUBDEV_ROUTING_ONLY_1_TO_1: * only non-overlapping 1-to-1 stream routing is allowed (a combination of * @V4L2_SUBDEV_ROUTING_NO_1_TO_N and @V4L2_SUBDEV_ROUTING_NO_N_TO_1) * @V4L2_SUBDEV_ROUTING_NO_STREAM_MIX: * all streams from a sink pad must be routed to a single source pad, and * that source pad shall not get routes from any other sink pad * (a combination of @V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX and * @V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX) * @V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING: * no multiplexed streams allowed on either source or sink sides. */ enum v4l2_subdev_routing_restriction { V4L2_SUBDEV_ROUTING_NO_1_TO_N = BIT(0), V4L2_SUBDEV_ROUTING_NO_N_TO_1 = BIT(1), V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX = BIT(2), V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX = BIT(3), V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING = BIT(4), V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING = BIT(5), V4L2_SUBDEV_ROUTING_ONLY_1_TO_1 = V4L2_SUBDEV_ROUTING_NO_1_TO_N | V4L2_SUBDEV_ROUTING_NO_N_TO_1, V4L2_SUBDEV_ROUTING_NO_STREAM_MIX = V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX | V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX, V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING = V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING | V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING, }; /** * v4l2_subdev_routing_validate() - Verify that routes comply with driver * constraints * @sd: The subdevice * @routing: Routing to verify * @disallow: Restrictions on routes * * This verifies that the given routing complies with the @disallow constraints. * * Returns 0 on success, error value otherwise. */ int v4l2_subdev_routing_validate(struct v4l2_subdev *sd, const struct v4l2_subdev_krouting *routing, enum v4l2_subdev_routing_restriction disallow); /** * v4l2_subdev_enable_streams() - Enable streams on a pad * @sd: The subdevice * @pad: The pad * @streams_mask: Bitmask of streams to enable * * This function enables streams on a source @pad of a subdevice. The pad is * identified by its index, while the streams are identified by the * @streams_mask bitmask. This allows enabling multiple streams on a pad at * once. * * Enabling a stream that is already enabled isn't allowed. If @streams_mask * contains an already enabled stream, this function returns -EALREADY without * performing any operation. * * Per-stream enable is only available for subdevs that implement the * .enable_streams() and .disable_streams() operations. For other subdevs, this * function implements a best-effort compatibility by calling the .s_stream() * operation, limited to subdevs that have a single source pad. * * Return: * * 0: Success * * -EALREADY: One of the streams in streams_mask is already enabled * * -EINVAL: The pad index is invalid, or doesn't correspond to a source pad * * -EOPNOTSUPP: Falling back to the legacy .s_stream() operation is * impossible because the subdev has multiple source pads */ int v4l2_subdev_enable_streams(struct v4l2_subdev *sd, u32 pad, u64 streams_mask); /** * v4l2_subdev_disable_streams() - Disable streams on a pad * @sd: The subdevice * @pad: The pad * @streams_mask: Bitmask of streams to disable * * This function disables streams on a source @pad of a subdevice. The pad is * identified by its index, while the streams are identified by the * @streams_mask bitmask. This allows disabling multiple streams on a pad at * once. * * Disabling a streams that is not enabled isn't allowed. If @streams_mask * contains a disabled stream, this function returns -EALREADY without * performing any operation. * * Per-stream disable is only available for subdevs that implement the * .enable_streams() and .disable_streams() operations. For other subdevs, this * function implements a best-effort compatibility by calling the .s_stream() * operation, limited to subdevs that have a single source pad. * * Return: * * 0: Success * * -EALREADY: One of the streams in streams_mask is not enabled * * -EINVAL: The pad index is invalid, or doesn't correspond to a source pad * * -EOPNOTSUPP: Falling back to the legacy .s_stream() operation is * impossible because the subdev has multiple source pads */ int v4l2_subdev_disable_streams(struct v4l2_subdev *sd, u32 pad, u64 streams_mask); /** * v4l2_subdev_s_stream_helper() - Helper to implement the subdev s_stream * operation using enable_streams and disable_streams * @sd: The subdevice * @enable: Enable or disable streaming * * Subdevice drivers that implement the streams-aware * &v4l2_subdev_pad_ops.enable_streams and &v4l2_subdev_pad_ops.disable_streams * operations can use this helper to implement the legacy * &v4l2_subdev_video_ops.s_stream operation. * * This helper can only be used by subdevs that have a single source pad. * * Return: 0 on success, or a negative error code otherwise. */ int v4l2_subdev_s_stream_helper(struct v4l2_subdev *sd, int enable); #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ #endif /* CONFIG_MEDIA_CONTROLLER */ /** * v4l2_subdev_lock_state() - Locks the subdev state * @state: The subdevice state * * Locks the given subdev state. * * The state must be unlocked with v4l2_subdev_unlock_state() after use. */ static inline void v4l2_subdev_lock_state(struct v4l2_subdev_state *state) { mutex_lock(state->lock); } /** * v4l2_subdev_unlock_state() - Unlocks the subdev state * @state: The subdevice state * * Unlocks the given subdev state. */ static inline void v4l2_subdev_unlock_state(struct v4l2_subdev_state *state) { mutex_unlock(state->lock); } /** * v4l2_subdev_get_unlocked_active_state() - Checks that the active subdev state * is unlocked and returns it * @sd: The subdevice * * Returns the active state for the subdevice, or NULL if the subdev does not * support active state. If the state is not NULL, calls * lockdep_assert_not_held() to issue a warning if the state is locked. * * This function is to be used e.g. when getting the active state for the sole * purpose of passing it forward, without accessing the state fields. */ static inline struct v4l2_subdev_state * v4l2_subdev_get_unlocked_active_state(struct v4l2_subdev *sd) { if (sd->active_state) lockdep_assert_not_held(sd->active_state->lock); return sd->active_state; } /** * v4l2_subdev_get_locked_active_state() - Checks that the active subdev state * is locked and returns it * * @sd: The subdevice * * Returns the active state for the subdevice, or NULL if the subdev does not * support active state. If the state is not NULL, calls lockdep_assert_held() * to issue a warning if the state is not locked. * * This function is to be used when the caller knows that the active state is * already locked. */ static inline struct v4l2_subdev_state * v4l2_subdev_get_locked_active_state(struct v4l2_subdev *sd) { if (sd->active_state) lockdep_assert_held(sd->active_state->lock); return sd->active_state; } /** * v4l2_subdev_lock_and_get_active_state() - Locks and returns the active subdev * state for the subdevice * @sd: The subdevice * * Returns the locked active state for the subdevice, or NULL if the subdev * does not support active state. * * The state must be unlocked with v4l2_subdev_unlock_state() after use. */ static inline struct v4l2_subdev_state * v4l2_subdev_lock_and_get_active_state(struct v4l2_subdev *sd) { if (sd->active_state) v4l2_subdev_lock_state(sd->active_state); return sd->active_state; } /** * v4l2_subdev_init - initializes the sub-device struct * * @sd: pointer to the &struct v4l2_subdev to be initialized * @ops: pointer to &struct v4l2_subdev_ops. */ void v4l2_subdev_init(struct v4l2_subdev *sd, const struct v4l2_subdev_ops *ops); extern const struct v4l2_subdev_ops v4l2_subdev_call_wrappers; /** * v4l2_subdev_call - call an operation of a v4l2_subdev. * * @sd: pointer to the &struct v4l2_subdev * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of callbacks functions. * @f: callback function to be called. * The callback functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Example: err = v4l2_subdev_call(sd, video, s_std, norm); */ #define v4l2_subdev_call(sd, o, f, args...) \ ({ \ struct v4l2_subdev *__sd = (sd); \ int __result; \ if (!__sd) \ __result = -ENODEV; \ else if (!(__sd->ops->o && __sd->ops->o->f)) \ __result = -ENOIOCTLCMD; \ else if (v4l2_subdev_call_wrappers.o && \ v4l2_subdev_call_wrappers.o->f) \ __result = v4l2_subdev_call_wrappers.o->f( \ __sd, ##args); \ else \ __result = __sd->ops->o->f(__sd, ##args); \ __result; \ }) /** * v4l2_subdev_call_state_active - call an operation of a v4l2_subdev which * takes state as a parameter, passing the * subdev its active state. * * @sd: pointer to the &struct v4l2_subdev * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of callbacks functions. * @f: callback function to be called. * The callback functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * This is similar to v4l2_subdev_call(), except that this version can only be * used for ops that take a subdev state as a parameter. The macro will get the * active state, lock it before calling the op and unlock it after the call. */ #define v4l2_subdev_call_state_active(sd, o, f, args...) \ ({ \ int __result; \ struct v4l2_subdev_state *state; \ state = v4l2_subdev_get_unlocked_active_state(sd); \ if (state) \ v4l2_subdev_lock_state(state); \ __result = v4l2_subdev_call(sd, o, f, state, ##args); \ if (state) \ v4l2_subdev_unlock_state(state); \ __result; \ }) /** * v4l2_subdev_call_state_try - call an operation of a v4l2_subdev which * takes state as a parameter, passing the * subdev a newly allocated try state. * * @sd: pointer to the &struct v4l2_subdev * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of callbacks functions. * @f: callback function to be called. * The callback functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * This is similar to v4l2_subdev_call_state_active(), except that as this * version allocates a new state, this is only usable for * V4L2_SUBDEV_FORMAT_TRY use cases. * * Note: only legacy non-MC drivers may need this macro. */ #define v4l2_subdev_call_state_try(sd, o, f, args...) \ ({ \ int __result; \ static struct lock_class_key __key; \ const char *name = KBUILD_BASENAME \ ":" __stringify(__LINE__) ":state->lock"; \ struct v4l2_subdev_state *state = \ __v4l2_subdev_state_alloc(sd, name, &__key); \ v4l2_subdev_lock_state(state); \ __result = v4l2_subdev_call(sd, o, f, state, ##args); \ v4l2_subdev_unlock_state(state); \ __v4l2_subdev_state_free(state); \ __result; \ }) /** * v4l2_subdev_has_op - Checks if a subdev defines a certain operation. * * @sd: pointer to the &struct v4l2_subdev * @o: The group of callback functions in &struct v4l2_subdev_ops * which @f is a part of. * @f: callback function to be checked for its existence. */ #define v4l2_subdev_has_op(sd, o, f) \ ((sd)->ops->o && (sd)->ops->o->f) /** * v4l2_subdev_notify_event() - Delivers event notification for subdevice * @sd: The subdev for which to deliver the event * @ev: The event to deliver * * Will deliver the specified event to all userspace event listeners which are * subscribed to the v42l subdev event queue as well as to the bridge driver * using the notify callback. The notification type for the notify callback * will be %V4L2_DEVICE_NOTIFY_EVENT. */ void v4l2_subdev_notify_event(struct v4l2_subdev *sd, const struct v4l2_event *ev); #endif /* _V4L2_SUBDEV_H */
121 73 80 20 20 20 14 14 14 20 259 254 225 228 69 228 254 267 256 257 268 266 268 267 268 266 257 5 246 9 1 8 266 266 267 266 266 265 8 8 8 8 8 14 14 14 14 14 14 14 14 14 14 1 1 1 1 1 266 264 90 245 266 267 1 267 263 90 245 246 255 255 255 266 267 267 267 266 255 255 254 19 14 19 11 7 7 11 5 5 5 5 38 38 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu.h" #include "mmu_internal.h" #include "mmutrace.h" #include "tdp_iter.h" #include "tdp_mmu.h" #include "spte.h" #include <asm/cmpxchg.h> #include <trace/events/kvm.h> /* Initializes the TDP MMU for the VM, if enabled. */ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); } /* Arbitrarily returns true so that this may be used in if statements. */ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, bool shared) { if (shared) lockdep_assert_held_read(&kvm->mmu_lock); else lockdep_assert_held_write(&kvm->mmu_lock); return true; } void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { /* * Invalidate all roots, which besides the obvious, schedules all roots * for zapping and thus puts the TDP MMU's reference to each root, i.e. * ultimately frees all roots. */ kvm_tdp_mmu_invalidate_all_roots(kvm); kvm_tdp_mmu_zap_invalidated_roots(kvm); WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* * Ensure that all the outstanding RCU callbacks to free shadow pages * can run before the VM is torn down. Putting the last reference to * zapped roots will create new callbacks. */ rcu_barrier(); } static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { free_page((unsigned long)sp->spt); kmem_cache_free(mmu_page_header_cache, sp); } /* * This is called through call_rcu in order to free TDP page table memory * safely with respect to other kernel threads that may be operating on * the memory. * By only accessing TDP MMU page table memory in an RCU read critical * section, and freeing it after a grace period, lockless access to that * memory won't use it after it is freed. */ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) { struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, rcu_head); tdp_mmu_free_sp(sp); } void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) { if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; /* * The TDP MMU itself holds a reference to each root until the root is * explicitly invalidated, i.e. the final reference should be never be * put for a valid root. */ KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_del_rcu(&root->link); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } /* * Returns the next root after @prev_root (or the first root if @prev_root is * NULL). A reference to the returned root is acquired, and the reference to * @prev_root is released (the caller obviously must hold a reference to * @prev_root if it's non-NULL). * * If @only_valid is true, invalid roots are skipped. * * Returns NULL if the end of tdp_mmu_roots was reached. */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, bool only_valid) { struct kvm_mmu_page *next_root; /* * While the roots themselves are RCU-protected, fields such as * role.invalid are protected by mmu_lock. */ lockdep_assert_held(&kvm->mmu_lock); rcu_read_lock(); if (prev_root) next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, &prev_root->link, typeof(*prev_root), link); else next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, typeof(*next_root), link); while (next_root) { if ((!only_valid || !next_root->role.invalid) && kvm_tdp_mmu_get_root(next_root)) break; next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, &next_root->link, typeof(*next_root), link); } rcu_read_unlock(); if (prev_root) kvm_tdp_mmu_put_root(kvm, prev_root); return next_root; } /* * Note: this iterator gets and puts references to the roots it iterates over. * This makes it safe to release the MMU lock and yield within the loop, but * if exiting the loop early, the caller must drop the reference to the most * recent root. (Unless keeping a live reference is desirable.) * * If shared is set, this function is operating under the MMU lock in read * mode. */ #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\ for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true) #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ for (_root = tdp_mmu_next_root(_kvm, NULL, false); \ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ _root = tdp_mmu_next_root(_kvm, _root, false)) /* * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, * the implication being that any flow that holds mmu_lock for read is * inherently yield-friendly and should use the yield-safe variant above. * Holding mmu_lock for write obviates the need for RCU protection as the list * is guaranteed to be stable. */ #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ kvm_mmu_page_as_id(_root) != _as_id) { \ } else static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); return sp; } static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, gfn_t gfn, union kvm_mmu_page_role role) { INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); sp->role = role; sp->gfn = gfn; sp->ptep = sptep; sp->tdp_mmu_page = true; trace_kvm_mmu_get_page(sp, true); } static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, struct tdp_iter *iter) { struct kvm_mmu_page *parent_sp; union kvm_mmu_page_role role; parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); role = parent_sp->role; role.level--; tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); } hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); /* * Check for an existing root before allocating a new one. Note, the * role check prevents consuming an invalid root. */ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { if (root->role.word == role.word && kvm_tdp_mmu_get_root(root)) goto out; } root = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_sp(root, NULL, 0, role); /* * TDP MMU roots are kept until they are explicitly invalidated, either * by a memslot update or by the destruction of the VM. Initialize the * refcount to two; one reference for the vCPU, and one reference for * the TDP MMU itself, which is held until the root is invalidated and * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). */ refcount_set(&root->tdp_mmu_root_count, 2); spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); out: return __pa(root->spt); } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); atomic64_inc(&kvm->arch.tdp_mmu_pages); } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); atomic64_dec(&kvm->arch.tdp_mmu_pages); } /** * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages * * @kvm: kvm instance * @sp: the page to be removed */ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { tdp_unaccount_mmu_page(kvm, sp); if (!sp->nx_huge_page_disallowed) return; spin_lock(&kvm->arch.tdp_mmu_pages_lock); sp->nx_huge_page_disallowed = false; untrack_possible_nx_huge_page(kvm, sp); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** * handle_removed_pt() - handle a page table removed from the TDP structure * * @kvm: kvm instance * @pt: the page removed from the paging structure * @shared: This operation may not be running under the exclusive use * of the MMU lock and the operation must synchronize with other * threads that might be modifying SPTEs. * * Given a page table that has been removed from the TDP paging structure, * iterates through the page table to clear SPTEs and free child page tables. * * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU * protection. Since this thread removed it from the paging structure, * this thread will be responsible for ensuring the page is freed. Hence the * early rcu_dereferences in the function. */ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) { struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; int i; trace_kvm_mmu_prepare_zap_page(sp); tdp_mmu_unlink_sp(kvm, sp); for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); u64 old_spte; if (shared) { /* * Set the SPTE to a nonpresent value that other * threads will not overwrite. If the SPTE was * already marked as removed then another thread * handling a page fault could overwrite it, so * set the SPTE until it is set from some other * value to the removed SPTE value. */ for (;;) { old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); if (!is_removed_spte(old_spte)) break; cpu_relax(); } } else { /* * If the SPTE is not MMU-present, there is no backing * page associated with the SPTE and so no side effects * that need to be recorded, and exclusive ownership of * mmu_lock ensures the SPTE can't be made present. * Note, zapping MMIO SPTEs is also unnecessary as they * are guarded by the memslots generation, not by being * unreachable. */ old_spte = kvm_tdp_mmu_read_spte(sptep); if (!is_shadow_present_pte(old_spte)) continue; /* * Use the common helper instead of a raw WRITE_ONCE as * the SPTE needs to be updated atomically if it can be * modified by a different vCPU outside of mmu_lock. * Even though the parent SPTE is !PRESENT, the TLB * hasn't yet been flushed, and both Intel and AMD * document that A/D assists can use upper-level PxE * entries that are cached in the TLB, i.e. the CPU can * still access the page and mark it dirty. * * No retry is needed in the atomic update path as the * sole concern is dropping a Dirty bit, i.e. no other * task can zap/remove the SPTE as mmu_lock is held for * write. Marking the SPTE as a removed SPTE is not * strictly necessary for the same reason, but using * the remove SPTE value keeps the shared/exclusive * paths consistent and allows the handle_changed_spte() * call below to hardcode the new value to REMOVED_SPTE. * * Note, even though dropping a Dirty bit is the only * scenario where a non-atomic update could result in a * functional bug, simply checking the Dirty bit isn't * sufficient as a fast page fault could read the upper * level SPTE before it is zapped, and then make this * target SPTE writable, resume the guest, and set the * Dirty bit between reading the SPTE above and writing * it here. */ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, REMOVED_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, old_spte, REMOVED_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance * @as_id: the address space of the paging structure the SPTE was a part of * @gfn: the base GFN that was mapped by the SPTE * @old_spte: The value of the SPTE before the change * @new_spte: The value of the SPTE after the change * @level: the level of the PT the SPTE is part of in the paging structure * @shared: This operation may not be running under the exclusive use of * the MMU lock and the operation must synchronize with other * threads that might be modifying SPTEs. * * Handle bookkeeping that might result from the modification of a SPTE. Note, * dirty logging updates are handled in common code, not here (see make_spte() * and fast_pf_fix_direct_spte()). */ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL); WARN_ON_ONCE(level < PG_LEVEL_4K); WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); /* * If this warning were to trigger it would indicate that there was a * missing MMU notifier or a race with some notifier handler. * A present, leaf SPTE should never be directly replaced with another * present leaf SPTE pointing to a different PFN. A notifier handler * should be zapping the SPTE before the main MM's page table is * changed, or the SPTE should be zeroed, and the TLBs flushed by the * thread before replacement. */ if (was_leaf && is_leaf && pfn_changed) { pr_err("Invalid SPTE change: cannot replace a present leaf\n" "SPTE with another present leaf SPTE mapping a\n" "different PFN!\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); /* * Crash the host to prevent error propagation and guest data * corruption. */ BUG(); } if (old_spte == new_spte) return; trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); if (is_leaf) check_spte_writable_invariants(new_spte); /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ * removed. In that case, there is nothing to do here. */ if (!was_present && !is_present) { /* * If this change does not involve a MMIO SPTE or removed SPTE, * it is unexpected. Log the change, though it should not * impact the guest since both the former and current SPTEs * are nonpresent. */ if (WARN_ON_ONCE(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte) && !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" "are MMIO SPTEs, or the new SPTE is\n" "a temporary removed SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; } if (is_leaf != was_leaf) kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); /* * Recursively handle child PTs if the change removed a subtree from * the paging structure. Note the WARN on the PFN changing without the * SPTE being converted to a hugepage (leaf) or being zapped. Shadow * pages are kernel allocations and should never be migrated. */ if (was_present && !was_leaf && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); if (was_leaf && is_accessed_spte(old_spte) && (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } /* * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * * If setting the SPTE fails because it has changed, iter->old_spte will be * refreshed to the current value of the spte. * * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @new_spte: The value the SPTE should be set to * Return: * * 0 - If the SPTE was set. * * -EBUSY - If the SPTE cannot be set. In this case this function will have * no side-effects other than setting iter->old_spte to the last * known value of the spte. */ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); /* * The caller is responsible for ensuring the old SPTE is not a REMOVED * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, * and pre-checking before inserting a new SPTE is advantageous as it * avoids unnecessary work. */ WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); lockdep_assert_held_read(&kvm->mmu_lock); /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. On failure, i.e. if a different logical * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with * the current value, so the caller operates on fresh data, e.g. if it * retries tdp_mmu_set_spte_atomic() */ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); return 0; } static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, struct tdp_iter *iter) { int ret; /* * Freeze the SPTE by setting it to a special, * non-present value. This will stop other threads from * immediately installing a present entry in its place * before the TLBs are flushed. */ ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); if (ret) return ret; kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); /* * No other thread can overwrite the removed SPTE as they must either * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not * overwrite the special removed SPTE value. No bookkeeping is needed * here since the SPTE is going from non-present to non-present. Use * the raw write helper to avoid an unnecessary check on volatile bits. */ __kvm_tdp_mmu_write_spte(iter->sptep, 0); return 0; } /* * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance * @as_id: Address space ID, i.e. regular vs. SMM * @sptep: Pointer to the SPTE * @old_spte: The current value of the SPTE * @new_spte: The new value that will be set for the SPTE * @gfn: The base GFN that was (or will be) mapped by the SPTE * @level: The level _containing_ the SPTE (its parent PT's level) * * Returns the old SPTE value, which _may_ be different than @old_spte if the * SPTE had voldatile bits. */ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, u64 old_spte, u64 new_spte, gfn_t gfn, int level) { lockdep_assert_held_write(&kvm->mmu_lock); /* * No thread should be using this function to set SPTEs to or from the * temporary removed SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. */ WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); return old_spte; } static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { WARN_ON_ONCE(iter->yielded); iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, new_spte, iter->gfn, iter->level); } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ for_each_tdp_pte(_iter, _root, _start, _end) #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ tdp_root_for_each_pte(_iter, _root, _start, _end) \ if (!is_shadow_present_pte(_iter.old_spte) || \ !is_last_spte(_iter.old_spte, _iter.level)) \ continue; \ else #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) /* * Yield if the MMU lock is contended or this thread needs to return control * to the scheduler. * * If this function should yield and flush is set, it will perform a remote * TLB flush before yielding. * * If this function yields, iter->yielded is set and the caller must skip to * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk * over the paging structures to allow the iterator to continue its traversal * from the paging structure root. * * Returns true if this function yielded. */ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { WARN_ON_ONCE(iter->yielded); /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush) kvm_flush_remote_tlbs(kvm); rcu_read_unlock(); if (shared) cond_resched_rwlock_read(&kvm->mmu_lock); else cond_resched_rwlock_write(&kvm->mmu_lock); rcu_read_lock(); WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); iter->yielded = true; } return iter->yielded; } static inline gfn_t tdp_mmu_max_gfn_exclusive(void) { /* * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with * a gpa range that would exceed the max gfn, and KVM does not create * MMIO SPTEs for "impossible" gfns, instead sending such accesses down * the slow emulation path every time. */ return kvm_mmu_max_gfn() + 1; } static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared, int zap_level) { struct tdp_iter iter; gfn_t end = tdp_mmu_max_gfn_exclusive(); gfn_t start = 0; for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; if (!is_shadow_present_pte(iter.old_spte)) continue; if (iter.level > zap_level) continue; if (!shared) tdp_mmu_iter_set_spte(kvm, &iter, 0); else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) goto retry; } } static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { /* * The root must have an elevated refcount so that it's reachable via * mmu_notifier callbacks, which allows this path to yield and drop * mmu_lock. When handling an unmap/release mmu_notifier command, KVM * must drop all references to relevant pages prior to completing the * callback. Dropping mmu_lock with an unreachable root would result * in zapping SPTEs after a relevant mmu_notifier callback completes * and lead to use-after-free as zapping a SPTE triggers "writeback" of * dirty accessed bits to the SPTE's associated struct page. */ WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); kvm_lockdep_assert_mmu_lock_held(kvm, shared); rcu_read_lock(); /* * To avoid RCU stalls due to recursively removing huge swaths of SPs, * split the zap into two passes. On the first pass, zap at the 1gb * level, and then zap top-level SPs on the second pass. "1gb" is not * arbitrary, as KVM must be able to zap a 1gb shadow page without * inducing a stall to allow in-place replacement with a 1gb hugepage. * * Because zapping a SP recurses on its children, stepping down to * PG_LEVEL_4K in the iterator itself is unnecessary. */ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); __tdp_mmu_zap_root(kvm, root, shared, root->role.level); rcu_read_unlock(); } bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 old_spte; /* * This helper intentionally doesn't allow zapping a root shadow page, * which doesn't have a parent page table and thus no associated entry. */ if (WARN_ON_ONCE(!sp->ptep)) return false; old_spte = kvm_tdp_mmu_read_spte(sp->ptep); if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) return false; tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, sp->gfn, sp->role.level + 1); return true; } /* * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the * operation can cause a soft lockup. */ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush) { struct tdp_iter iter; end = min(end, tdp_mmu_max_gfn_exclusive()); lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; continue; } if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; tdp_mmu_iter_set_spte(kvm, &iter, 0); flush = true; } rcu_read_unlock(); /* * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. */ return flush; } /* * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or * more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root) flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; } void kvm_tdp_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *root; /* * Zap all roots, including invalid roots, as all SPTEs must be dropped * before returning to the caller. Zap directly even if the root is * also being zapped by a worker. Walking zapped top-level SPTEs isn't * all that expensive and mmu_lock is already held, which means the * worker has yielded, i.e. flushing the work instead of zapping here * isn't guaranteed to be any faster. * * A TLB flush is unnecessary, KVM zaps everything if and only the VM * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root) tdp_mmu_zap_root(kvm, root, false); } /* * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast * zap" completes. */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { struct kvm_mmu_page *root; read_lock(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root) { if (!root->tdp_mmu_scheduled_root_to_zap) continue; root->tdp_mmu_scheduled_root_to_zap = false; KVM_BUG_ON(!root->role.invalid, kvm); /* * A TLB flush is not necessary as KVM performs a local TLB * flush when allocating a new root (see kvm_mmu_load()), and * when migrating a vCPU to a different pCPU. Note, the local * TLB flush on reuse also invalidates paging-structure-cache * entries, i.e. TLB entries for intermediate paging structures, * that may be zapped, as such entries are associated with the * ASID on both VMX and SVM. */ tdp_mmu_zap_root(kvm, root, true); /* * The referenced needs to be put *after* zapping the root, as * the root must be reachable by mmu_notifiers while it's being * zapped */ kvm_tdp_mmu_put_root(kvm, root); } read_unlock(&kvm->mmu_lock); } /* * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that * is about to be zapped, e.g. in response to a memslots update. The actual * zapping is done separately so that it happens with mmu_lock with read, * whereas invalidating roots must be done with mmu_lock held for write (unless * the VM is being destroyed). * * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. * See kvm_tdp_mmu_get_vcpu_root_hpa(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) { struct kvm_mmu_page *root; /* * mmu_lock must be held for write to ensure that a root doesn't become * invalid while there are active readers (invalidating a root while * there are active readers may or may not be problematic in practice, * but it's uncharted territory and not supported). * * Waive the assertion if there are no users of @kvm, i.e. the VM is * being destroyed after all references have been put, or if no vCPUs * have been created (which means there are no roots), i.e. the VM is * being destroyed in an error path of KVM_CREATE_VM. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING) && refcount_read(&kvm->users_count) && kvm->created_vcpus) lockdep_assert_held_write(&kvm->mmu_lock); /* * As above, mmu_lock isn't held when destroying the VM! There can't * be other references to @kvm, i.e. nothing else can invalidate roots * or get/put references to roots. */ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { /* * Note, invalid roots can outlive a memslot update! Invalid * roots must be *zapped* before the memslot update completes, * but a different task can acquire a reference and keep the * root alive after its been zapped. */ if (!root->role.invalid) { root->tdp_mmu_scheduled_root_to_zap = true; root->role.invalid = true; } } } /* * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) */ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, struct tdp_iter *iter) { struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); u64 new_spte; int ret = RET_PF_FIXED; bool wrprot = false; if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, fault->pfn, iter->old_spte, fault->prefetch, true, fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && !is_last_spte(iter->old_spte, iter->level)) kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* * If the page fault was caused by a write but the page is write * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ if (wrprot) { if (fault->write) ret = RET_PF_EMULATE; } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ if (unlikely(is_mmio_spte(new_spte))) { vcpu->stat.pf_mmio_spte_created++; trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); ret = RET_PF_EMULATE; } else { trace_kvm_mmu_set_spte(iter->level, iter->gfn, rcu_dereference(iter->sptep)); } return ret; } /* * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the * provided page table. * * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @sp: The new TDP page table to install. * @shared: This operation is running under the MMU lock in read mode. * * Returns: 0 if the new page table was installed. Non-0 if the page table * could not be installed (e.g. the atomic compare-exchange failed). */ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); int ret = 0; if (shared) { ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); if (ret) return ret; } else { tdp_mmu_iter_set_spte(kvm, iter, spte); } tdp_account_mmu_page(kvm, sp); return 0; } static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared); /* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu *mmu = vcpu->arch.mmu; struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; int ret = RET_PF_RETRY; kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); rcu_read_lock(); tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { int r; if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); /* * If SPTE has been frozen by another thread, just give up and * retry, avoiding unnecessary page table allocation and free. */ if (is_removed_spte(iter.old_spte)) goto retry; if (iter.level == fault->goal_level) goto map_target_level; /* Step down into the lower level page table if it exists. */ if (is_shadow_present_pte(iter.old_spte) && !is_large_pte(iter.old_spte)) continue; /* * The SPTE is either non-present or points to a huge page that * needs to be split. */ sp = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_child_sp(sp, &iter); sp->nx_huge_page_disallowed = fault->huge_page_disallowed; if (is_shadow_present_pte(iter.old_spte)) r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); else r = tdp_mmu_link_sp(kvm, &iter, sp, true); /* * Force the guest to retry if installing an upper level SPTE * failed, e.g. because a different task modified the SPTE. */ if (r) { tdp_mmu_free_sp(sp); goto retry; } if (fault->huge_page_disallowed && fault->req_level >= iter.level) { spin_lock(&kvm->arch.tdp_mmu_pages_lock); if (sp->nx_huge_page_disallowed) track_possible_nx_huge_page(kvm, sp); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } } /* * The walk aborted before reaching the target level, e.g. because the * iterator detected an upper level SPTE was frozen during traversal. */ WARN_ON_ONCE(iter.level == fault->goal_level); goto retry; map_target_level: ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); retry: rcu_read_unlock(); return ret; } bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { struct kvm_mmu_page *root; __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, range->may_block, flush); return flush; } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range); static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, struct kvm_gfn_range *range, tdp_handler_t handler) { struct kvm_mmu_page *root; struct tdp_iter iter; bool ret = false; /* * Don't support rescheduling, none of the MMU notifiers that funnel * into this helper allow blocking; it'd be dead, wasteful code. */ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) ret |= handler(kvm, &iter, range); rcu_read_unlock(); } return ret; } /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. * * No need to mark the corresponding PFN as accessed as this call is coming * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range) { u64 new_spte; /* If we have a non-accessed entry we don't need to change the pte. */ if (!is_accessed_spte(iter->old_spte)) return false; if (spte_ad_enabled(iter->old_spte)) { iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, iter->old_spte, shadow_accessed_mask, iter->level); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { /* * Capture the dirty status of the page, so that it doesn't get * lost when the SPTE is marked for access tracking. */ if (is_writable_pte(iter->old_spte)) kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); new_spte = mark_spte_for_access_track(iter->old_spte); iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, iter->level); } trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, iter->old_spte, new_spte); return true; } bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); } static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range) { return is_accessed_spte(iter->old_spte); } bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); } static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range) { u64 new_spte; /* Huge pages aren't expected to be modified without first being zapped. */ WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end); if (iter->level != PG_LEVEL_4K || !is_shadow_present_pte(iter->old_spte)) return false; /* * Note, when changing a read-only SPTE, it's not strictly necessary to * zero the SPTE before setting the new PFN, but doing so preserves the * invariant that the PFN of a present * leaf SPTE can never change. * See handle_changed_spte(). */ tdp_mmu_iter_set_spte(kvm, iter, 0); if (!pte_write(range->arg.pte)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, pte_pfn(range->arg.pte)); tdp_mmu_iter_set_spte(kvm, iter, new_spte); } return true; } /* * Handle the changed_pte MMU notifier for the TDP MMU. * data is a pointer to the new pte_t mapping the HVA specified by the MMU * notifier. * Returns non-zero if a flush is needed before releasing the MMU lock. */ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* * No need to handle the remote TLB flush under RCU protection, the * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a * shadow page. See the WARN on pfn_changed in handle_changed_spte(). */ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); } /* * Remove write access from all SPTEs at or above min_level that map GFNs * [start, end). Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int min_level) { struct tdp_iter iter; u64 new_spte; bool spte_set = false; rcu_read_lock(); BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); for_each_tdp_pte_min_level(iter, root, min_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level) || !(iter.old_spte & PT_WRITABLE_MASK)) continue; new_spte = iter.old_spte & ~PT_WRITABLE_MASK; if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) goto retry; spte_set = true; } rcu_read_unlock(); return spte_set; } /* * Remove write access from all the SPTEs mapping GFNs in the memslot. Will * only affect leaf SPTEs down to min_level. * Returns true if an SPTE has been changed and the TLBs need to be flushed. */ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); return spte_set; } static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) { struct kvm_mmu_page *sp; gfp |= __GFP_ZERO; sp = kmem_cache_alloc(mmu_page_header_cache, gfp); if (!sp) return NULL; sp->spt = (void *)__get_free_page(gfp); if (!sp->spt) { kmem_cache_free(mmu_page_header_cache, sp); return NULL; } return sp; } static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, struct tdp_iter *iter, bool shared) { struct kvm_mmu_page *sp; kvm_lockdep_assert_mmu_lock_held(kvm, shared); /* * Since we are allocating while under the MMU lock we have to be * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct * reclaim and to avoid making any filesystem callbacks (which can end * up invoking KVM MMU notifiers, resulting in a deadlock). * * If this allocation fails we drop the lock and retry with reclaim * allowed. */ sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); if (sp) return sp; rcu_read_unlock(); if (shared) read_unlock(&kvm->mmu_lock); else write_unlock(&kvm->mmu_lock); iter->yielded = true; sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); if (shared) read_lock(&kvm->mmu_lock); else write_lock(&kvm->mmu_lock); rcu_read_lock(); return sp; } /* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { const u64 huge_spte = iter->old_spte; const int level = iter->level; int ret, i; /* * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level * page table. Since we are making this change without a TLB flush vCPUs * will see a mix of the split mappings and the original huge mapping, * depending on what's currently in their TLB. This is fine from a * correctness standpoint since the translation will be the same either * way. */ ret = tdp_mmu_link_sp(kvm, iter, sp, shared); if (ret) goto out; /* * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we * are overwriting from the page stats. But we have to manually update * the page stats with the new present child pages. */ kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); out: trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); return ret; } static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int target_level, bool shared) { struct kvm_mmu_page *sp = NULL; struct tdp_iter iter; int ret = 0; rcu_read_lock(); /* * Traverse the page table splitting all huge pages above the target * level into one lower level. For example, if we encounter a 1GB page * we split it into 512 2MB pages. * * Since the TDP iterator uses a pre-order traversal, we are guaranteed * to visit an SPTE before ever visiting its children, which means we * will correctly recursively split huge pages that are more than one * level above the target level (e.g. splitting a 1GB to 512 2MB pages, * and then splitting each of those to 512 4KB pages). */ for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) continue; if (!sp) { sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); if (!sp) { ret = -ENOMEM; trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte, iter.level, ret); break; } if (iter.yielded) continue; } tdp_mmu_init_child_sp(sp, &iter); if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) goto retry; sp = NULL; } rcu_read_unlock(); /* * It's possible to exit the loop having never used the last sp if, for * example, a vCPU doing HugePage NX splitting wins the race and * installs its own sp in place of the last sp we tried to split. */ if (sp) tdp_mmu_free_sp(sp); return ret; } /* * Try to split all huge pages mapped by the TDP MMU down to the target level. */ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, int target_level, bool shared) { struct kvm_mmu_page *root; int r = 0; kvm_lockdep_assert_mmu_lock_held(kvm, shared); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) { r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); if (r) { kvm_tdp_mmu_put_root(kvm, root); break; } } } /* * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. * If AD bits are not enabled, this will require clearing the writable bit on * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; struct tdp_iter iter; bool spte_set = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { retry: if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; KVM_MMU_WARN_ON(kvm_ad_enabled() && spte_ad_need_write_protect(iter.old_spte)); if (!(iter.old_spte & dbit)) continue; if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; spte_set = true; } rcu_read_unlock(); return spte_set; } /* * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. * If AD bits are not enabled, this will require clearing the writable bit on * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); return spte_set; } /* * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is * set in mask, starting at gfn. The given memslot is expected to contain all * the GFNs represented by set bits in the mask. If AD bits are enabled, * clearing the dirty status will involve clearing the dirty bit on each SPTE * or, if AD bits are not enabled, clearing the writable bit on each SPTE. */ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) break; KVM_MMU_WARN_ON(kvm_ad_enabled() && spte_ad_need_write_protect(iter.old_spte)); if (iter.level > PG_LEVEL_4K || !(mask & (1UL << (iter.gfn - gfn)))) continue; mask &= ~(1UL << (iter.gfn - gfn)); if (!(iter.old_spte & dbit)) continue; iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, iter.old_spte, dbit, iter.level); trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, iter.old_spte, iter.old_spte & ~dbit); kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); } rcu_read_unlock(); } /* * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is * set in mask, starting at gfn. The given memslot is expected to contain all * the GFNs represented by set bits in the mask. If AD bits are enabled, * clearing the dirty status will involve clearing the dirty bit on each SPTE * or, if AD bits are not enabled, clearing the writable bit on each SPTE. */ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot) { struct kvm_mmu_page *root; for_each_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || !is_shadow_present_pte(iter.old_spte)) continue; /* * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with * a large page size, then its parent would have been zapped * instead of stepping down. */ if (is_last_spte(iter.old_spte, iter.level)) continue; /* * If iter.gfn resides outside of the slot, i.e. the page for * the current level overlaps but is not contained by the slot, * then the SPTE can't be made huge. More importantly, trying * to query that info from slot->arch.lpage_info will cause an * out-of-bounds access. */ if (iter.gfn < start || iter.gfn >= end) continue; max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, PG_LEVEL_NUM); if (max_mapping_level < iter.level) continue; /* Note, a successful atomic zap also does a remote TLB flush. */ if (tdp_mmu_zap_spte_atomic(kvm, &iter)) goto retry; } rcu_read_unlock(); } /* * Zap non-leaf SPTEs (and free their associated page tables) which could * be replaced by huge pages, for GFNs within the slot. */ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) zap_collapsible_spte_range(kvm, root, slot); } /* * Removes write access on the last level SPTE mapping this GFN and unsets the * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, int min_level) { struct tdp_iter iter; u64 new_spte; bool spte_set = false; BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; new_spte = iter.old_spte & ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); if (new_spte == iter.old_spte) break; tdp_mmu_iter_set_spte(kvm, &iter, new_spte); spte_set = true; } rcu_read_unlock(); return spte_set; } /* * Removes write access on the last level SPTE mapping this GFN and unsets the * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root, slot->as_id) spte_set |= write_protect_gfn(kvm, root, gfn, min_level); return spte_set; } /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; *root_level = vcpu->arch.mmu->root_role.level; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } return leaf; } /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no * walk could be performed, returns NULL and *spte does not contain valid data. * * Contract: * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. * * WARNING: This function is only intended to be called during fast_page_fault. */ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; gfn_t gfn = addr >> PAGE_SHIFT; tdp_ptep_t sptep = NULL; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { *spte = iter.old_spte; sptep = iter.sptep; } /* * Perform the rcu_dereference to get the raw spte pointer value since * we are passing it up to fast_page_fault, which is shared with the * legacy MMU and thus does not retain the TDP MMU-specific __rcu * annotation. * * This is safe since fast_page_fault obeys the contracts of this * function as well as all TDP MMU contracts around modifying SPTEs * outside of mmu_lock. */ return rcu_dereference(sptep); }
4 4 4 1 4 3 3 1 3 3 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 /* Upcall routine, designed to work as a key type and working through * /sbin/request-key to contact userspace when handling DNS queries. * * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) * Steve French (sfrench@us.ibm.com) * Wang Lei (wang840925@gmail.com) * David Howells (dhowells@redhat.com) * * The upcall wrapper used to make an arbitrary DNS query. * * This function requires the appropriate userspace tool dns.upcall to be * installed and something like the following lines should be added to the * /etc/request-key.conf file: * * create dns_resolver * * /sbin/dns.upcall %k * * For example to use this module to query AFSDB RR: * * create dns_resolver afsdb:* * /sbin/dns.afsdb %k * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/dns_resolver.h> #include <linux/err.h> #include <net/net_namespace.h> #include <keys/dns_resolver-type.h> #include <keys/user-type.h> #include "internal.h" /** * dns_query - Query the DNS * @net: The network namespace to operate in. * @type: Query type (or NULL for straight host->IP lookup) * @name: Name to look up * @namelen: Length of name * @options: Request options (or NULL if no options) * @_result: Where to place the returned data (or NULL) * @_expiry: Where to store the result expiry time (or NULL) * @invalidate: Always invalidate the key after use * * The data will be returned in the pointer at *result, if provided, and the * caller is responsible for freeing it. * * The description should be of the form "[<query_type>:]<domain_name>", and * the options need to be appropriate for the query type requested. If no * query_type is given, then the query is a straight hostname to IP address * lookup. * * The DNS resolution lookup is performed by upcalling to userspace by way of * requesting a key of type dns_resolver. * * Returns the size of the result on success, -ve error code otherwise. */ int dns_query(struct net *net, const char *type, const char *name, size_t namelen, const char *options, char **_result, time64_t *_expiry, bool invalidate) { struct key *rkey; struct user_key_payload *upayload; const struct cred *saved_cred; size_t typelen, desclen; char *desc, *cp; int ret, len; kenter("%s,%*.*s,%zu,%s", type, (int)namelen, (int)namelen, name, namelen, options); if (!name || namelen == 0) return -EINVAL; /* construct the query key description as "[<type>:]<name>" */ typelen = 0; desclen = 0; if (type) { typelen = strlen(type); if (typelen < 1) return -EINVAL; desclen += typelen + 1; } if (namelen < 3 || namelen > 255) return -EINVAL; desclen += namelen + 1; desc = kmalloc(desclen, GFP_KERNEL); if (!desc) return -ENOMEM; cp = desc; if (type) { memcpy(cp, type, typelen); cp += typelen; *cp++ = ':'; } memcpy(cp, name, namelen); cp += namelen; *cp = '\0'; if (!options) options = ""; kdebug("call request_key(,%s,%s)", desc, options); /* make the upcall, using special credentials to prevent the use of * add_key() to preinstall malicious redirections */ saved_cred = override_creds(dns_resolver_cache); rkey = request_key_net(&key_type_dns_resolver, desc, net, options); revert_creds(saved_cred); kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; } down_read(&rkey->sem); set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); rkey->perm |= KEY_USR_VIEW; ret = key_validate(rkey); if (ret < 0) goto put; /* If the DNS server gave an error, return that to the caller */ ret = PTR_ERR(rkey->payload.data[dns_key_error]); if (ret) goto put; upayload = user_key_payload_locked(rkey); len = upayload->datalen; if (_result) { ret = -ENOMEM; *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL); if (!*_result) goto put; } if (_expiry) *_expiry = rkey->expiry; ret = len; put: up_read(&rkey->sem); if (invalidate) key_invalidate(rkey); key_put(rkey); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(dns_query);
8 13 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 #ifndef LLC_H #define LLC_H /* * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rculist_nulls.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/atomic.h> struct net_device; struct packet_type; struct sk_buff; struct llc_addr { unsigned char lsap; unsigned char mac[IFHWADDRLEN]; }; #define LLC_SAP_STATE_INACTIVE 1 #define LLC_SAP_STATE_ACTIVE 2 #define LLC_SK_DEV_HASH_BITS 6 #define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS) #define LLC_SK_LADDR_HASH_BITS 6 #define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS) /** * struct llc_sap - Defines the SAP component * * @station - station this sap belongs to * @state - sap state * @p_bit - only lowest-order bit used * @f_bit - only lowest-order bit used * @laddr - SAP value in this 'lsap' * @node - entry in station sap_list * @sk_list - LLC sockets this one manages */ struct llc_sap { unsigned char state; unsigned char p_bit; unsigned char f_bit; refcount_t refcnt; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; spinlock_t sk_lock; int sk_count; struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES]; struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES]; struct rcu_head rcu; }; static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); return &sap->sk_dev_hash[bucket]; } static inline u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr) { return hash_32(jhash(laddr->mac, sizeof(laddr->mac), 0), LLC_SK_LADDR_HASH_BITS); } static inline struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap, const struct llc_addr *laddr) { return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)]; } #define LLC_DEST_INVALID 0 /* Invalid LLC PDU type */ #define LLC_DEST_SAP 1 /* Type 1 goes here */ #define LLC_DEST_CONN 2 /* Type 2 goes here */ extern struct list_head llc_sap_list; int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)); void llc_remove_pack(int type); void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)); static inline void llc_sap_hold(struct llc_sap *sap) { refcount_inc(&sap->refcnt); } static inline bool llc_sap_hold_safe(struct llc_sap *sap) { return refcount_inc_not_zero(&sap->refcnt); } void llc_sap_close(struct llc_sap *sap); static inline void llc_sap_put(struct llc_sap *sap) { if (refcount_dec_and_test(&sap->refcnt)) llc_sap_close(sap); } struct llc_sap *llc_sap_find(unsigned char sap_value); int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap); void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_station_init(void); void llc_station_exit(void); #ifdef CONFIG_PROC_FS int llc_proc_init(void); void llc_proc_exit(void); #else #define llc_proc_init() (0) #define llc_proc_exit() do { } while(0) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL int llc_sysctl_init(void); void llc_sysctl_exit(void); extern int sysctl_llc2_ack_timeout; extern int sysctl_llc2_busy_timeout; extern int sysctl_llc2_p_timeout; extern int sysctl_llc2_rej_timeout; #else #define llc_sysctl_init() (0) #define llc_sysctl_exit() do { } while(0) #endif /* CONFIG_SYSCTL */ #endif /* LLC_H */
199 95 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 // SPDX-License-Identifier: GPL-2.0 #include <linux/swap_cgroup.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/swapops.h> /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); struct swap_cgroup_ctrl { struct page **map; unsigned long length; spinlock_t lock; }; static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; struct swap_cgroup { unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) /* * SwapCgroup implements "lookup" and "exchange" operations. * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge * against SwapCache. At swap_free(), this is accessed directly from swap. * * This means, * - we have no race in "exchange" when we're accessed via SwapCache because * SwapCache(and its swp_entry) is under lock. * - When called via swap_free(), there is no user of this entry and no race. * Then, we don't need lock around "exchange". * * TODO: we can push these buffers out to HIGHMEM. */ /* * allocate buffer for swap_cgroup. */ static int swap_cgroup_prepare(int type) { struct page *page; struct swap_cgroup_ctrl *ctrl; unsigned long idx, max; ctrl = &swap_cgroup_ctrl[type]; for (idx = 0; idx < ctrl->length; idx++) { page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) goto not_enough_page; ctrl->map[idx] = page; if (!(idx % SWAP_CLUSTER_MAX)) cond_resched(); } return 0; not_enough_page: max = idx; for (idx = 0; idx < max; idx++) __free_page(ctrl->map[idx]); return -ENOMEM; } static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, pgoff_t offset) { struct page *mappage; struct swap_cgroup *sc; mappage = ctrl->map[offset / SC_PER_PAGE]; sc = page_address(mappage); return sc + offset % SC_PER_PAGE; } static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, struct swap_cgroup_ctrl **ctrlp) { pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; return __lookup_swap_cgroup(ctrl, offset); } /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. * @ent: swap entry to be cmpxchged * @old: old id * @new: new id * * Returns old id at success, 0 at failure. * (There is no mem_cgroup using 0 as its id) */ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned long flags; unsigned short retval; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); retval = sc->id; if (retval == old) sc->id = new; else retval = 0; spin_unlock_irqrestore(&ctrl->lock, flags); return retval; } /** * swap_cgroup_record - record mem_cgroup for a set of swap entries * @ent: the first swap entry to be recorded into * @id: mem_cgroup to be recorded * @nr_ents: number of swap entries to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) */ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, unsigned int nr_ents) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned short old; unsigned long flags; pgoff_t offset = swp_offset(ent); pgoff_t end = offset + nr_ents; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; for (;;) { VM_BUG_ON(sc->id != old); sc->id = id; offset++; if (offset == end) break; if (offset % SC_PER_PAGE) sc++; else sc = __lookup_swap_cgroup(ctrl, offset); } spin_unlock_irqrestore(&ctrl->lock, flags); return old; } /** * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { return lookup_swap_cgroup(ent, NULL)->id; } int swap_cgroup_swapon(int type, unsigned long max_pages) { void *array; unsigned long length; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array = vcalloc(length, sizeof(void *)); if (!array) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); ctrl->length = length; ctrl->map = array; spin_lock_init(&ctrl->lock); if (swap_cgroup_prepare(type)) { /* memory shortage */ ctrl->map = NULL; ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); vfree(array); goto nomem; } mutex_unlock(&swap_cgroup_mutex); return 0; nomem: pr_info("couldn't allocate enough memory for swap_cgroup\n"); pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } void swap_cgroup_swapoff(int type) { struct page **map; unsigned long i, length; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return; mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; length = ctrl->length; ctrl->map = NULL; ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); if (map) { for (i = 0; i < length; i++) { struct page *page = map[i]; if (page) __free_page(page); if (!(i % SWAP_CLUSTER_MAX)) cond_resched(); } vfree(map); } }
2 7 6 6 1 8 1 8 7 7 8 8 6 6 6 6 4 2 4 2 2 4 2 2 2 2 2 2 2 2 8 1 7 5 5 5 5 2 2 2 2 2 3 3 8 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_route.c ROUTE4 classifier. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/dst.h> #include <net/route.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> /* * 1. For now we assume that route tags < 256. * It allows to use direct table lookups, instead of hash tables. * 2. For now we assume that "from TAG" and "fromdev DEV" statements * are mutually exclusive. * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" */ struct route4_fastmap { struct route4_filter *filter; u32 id; int iif; }; struct route4_head { struct route4_fastmap fastmap[16]; struct route4_bucket __rcu *table[256 + 1]; struct rcu_head rcu; }; struct route4_bucket { /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ struct route4_filter __rcu *ht[16 + 16 + 1]; struct rcu_head rcu; }; struct route4_filter { struct route4_filter __rcu *next; u32 id; int iif; struct tcf_result res; struct tcf_exts exts; u32 handle; struct route4_bucket *bkt; struct tcf_proto *tp; struct rcu_work rwork; }; #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) static inline int route4_fastmap_hash(u32 id, int iif) { return id & 0xF; } static DEFINE_SPINLOCK(fastmap_lock); static void route4_reset_fastmap(struct route4_head *head) { spin_lock_bh(&fastmap_lock); memset(head->fastmap, 0, sizeof(head->fastmap)); spin_unlock_bh(&fastmap_lock); } static void route4_set_fastmap(struct route4_head *head, u32 id, int iif, struct route4_filter *f) { int h = route4_fastmap_hash(id, iif); /* fastmap updates must look atomic to aling id, iff, filter */ spin_lock_bh(&fastmap_lock); head->fastmap[h].id = id; head->fastmap[h].iif = iif; head->fastmap[h].filter = f; spin_unlock_bh(&fastmap_lock); } static inline int route4_hash_to(u32 id) { return id & 0xFF; } static inline int route4_hash_from(u32 id) { return (id >> 16) & 0xF; } static inline int route4_hash_iif(int iif) { return 16 + ((iif >> 16) & 0xF); } static inline int route4_hash_wild(void) { return 32; } #define ROUTE4_APPLY_RESULT() \ { \ *res = f->res; \ if (tcf_exts_has_actions(&f->exts)) { \ int r = tcf_exts_exec(skb, &f->exts, res); \ if (r < 0) { \ dont_cache = 1; \ continue; \ } \ return r; \ } else if (!dont_cache) \ route4_set_fastmap(head, id, iif, f); \ return 0; \ } TC_INDIRECT_SCOPE int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct route4_head *head = rcu_dereference_bh(tp->root); struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; u32 id, h; int iif, dont_cache = 0; dst = skb_dst(skb); if (!dst) goto failure; id = dst->tclassid; iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); spin_lock(&fastmap_lock); if (id == head->fastmap[h].id && iif == head->fastmap[h].iif && (f = head->fastmap[h].filter) != NULL) { if (f == ROUTE4_FAILURE) { spin_unlock(&fastmap_lock); goto failure; } *res = f->res; spin_unlock(&fastmap_lock); return 0; } spin_unlock(&fastmap_lock); h = route4_hash_to(id); restart: b = rcu_dereference_bh(head->table[h]); if (b) { for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]); f; f = rcu_dereference_bh(f->next)) if (f->id == id) ROUTE4_APPLY_RESULT(); for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]); f; f = rcu_dereference_bh(f->next)) if (f->iif == iif) ROUTE4_APPLY_RESULT(); for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]); f; f = rcu_dereference_bh(f->next)) ROUTE4_APPLY_RESULT(); } if (h < 256) { h = 256; id &= ~0xFFFF; goto restart; } if (!dont_cache) route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); failure: return -1; } static inline u32 to_hash(u32 id) { u32 h = id & 0xFF; if (id & 0x8000) h += 256; return h; } static inline u32 from_hash(u32 id) { id &= 0xFFFF; if (id == 0xFFFF) return 32; if (!(id & 0x8000)) { if (id > 255) return 256; return id & 0xF; } return 16 + (id & 0xF); } static void *route4_get(struct tcf_proto *tp, u32 handle) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_bucket *b; struct route4_filter *f; unsigned int h1, h2; h1 = to_hash(handle); if (h1 > 256) return NULL; h2 = from_hash(handle >> 16); if (h2 > 32) return NULL; b = rtnl_dereference(head->table[h1]); if (b) { for (f = rtnl_dereference(b->ht[h2]); f; f = rtnl_dereference(f->next)) if (f->handle == handle) return f; } return NULL; } static int route4_init(struct tcf_proto *tp) { struct route4_head *head; head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; rcu_assign_pointer(tp->root, head); return 0; } static void __route4_delete_filter(struct route4_filter *f) { tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); } static void route4_delete_filter_work(struct work_struct *work) { struct route4_filter *f = container_of(to_rcu_work(work), struct route4_filter, rwork); rtnl_lock(); __route4_delete_filter(f); rtnl_unlock(); } static void route4_queue_work(struct route4_filter *f) { tcf_queue_work(&f->rwork, route4_delete_filter_work); } static void route4_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; if (head == NULL) return; for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; b = rtnl_dereference(head->table[h1]); if (b) { for (h2 = 0; h2 <= 32; h2++) { struct route4_filter *f; while ((f = rtnl_dereference(b->ht[h2])) != NULL) { struct route4_filter *next; next = rtnl_dereference(f->next); RCU_INIT_POINTER(b->ht[h2], next); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) route4_queue_work(f); else __route4_delete_filter(f); } } RCU_INIT_POINTER(head->table[h1], NULL); kfree_rcu(b, rcu); } } kfree_rcu(head, rcu); } static int route4_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter *f = arg; struct route4_filter __rcu **fp; struct route4_filter *nf; struct route4_bucket *b; unsigned int h = 0; int i, h1; if (!head || !f) return -EINVAL; h = f->handle; b = f->bkt; fp = &b->ht[from_hash(h >> 16)]; for (nf = rtnl_dereference(*fp); nf; fp = &nf->next, nf = rtnl_dereference(*fp)) { if (nf == f) { /* unlink it */ RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); /* Remove any fastmap lookups that might ref filter * notice we unlink'd the filter so we can't get it * back in the fastmap. */ route4_reset_fastmap(head); /* Delete it */ tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, route4_delete_filter_work); /* Strip RTNL protected tree */ for (i = 0; i <= 32; i++) { struct route4_filter *rt; rt = rtnl_dereference(b->ht[i]); if (rt) goto out; } /* OK, session has no flows */ RCU_INIT_POINTER(head->table[to_hash(h)], NULL); kfree_rcu(b, rcu); break; } } out: *last = true; for (h1 = 0; h1 <= 256; h1++) { if (rcu_access_pointer(head->table[h1])) { *last = false; break; } } return 0; } static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 }, [TCA_ROUTE4_TO] = NLA_POLICY_MAX(NLA_U32, 0xFF), [TCA_ROUTE4_FROM] = NLA_POLICY_MAX(NLA_U32, 0xFF), [TCA_ROUTE4_IIF] = NLA_POLICY_MAX(NLA_U32, 0x7FFF), }; static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, struct nlattr **tb, struct nlattr *est, int new, u32 flags, struct netlink_ext_ack *extack) { u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; unsigned int h1; struct route4_bucket *b; int err; err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; if (tb[TCA_ROUTE4_TO]) { if (new && handle & 0x8000) { NL_SET_ERR_MSG(extack, "Invalid handle"); return -EINVAL; } to = nla_get_u32(tb[TCA_ROUTE4_TO]); nhandle = to; } if (tb[TCA_ROUTE4_FROM] && tb[TCA_ROUTE4_IIF]) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_ROUTE4_FROM], "'from' and 'fromif' are mutually exclusive"); return -EINVAL; } if (tb[TCA_ROUTE4_FROM]) { id = nla_get_u32(tb[TCA_ROUTE4_FROM]); nhandle |= id << 16; } else if (tb[TCA_ROUTE4_IIF]) { id = nla_get_u32(tb[TCA_ROUTE4_IIF]); nhandle |= (id | 0x8000) << 16; } else nhandle |= 0xFFFF << 16; if (handle && new) { nhandle |= handle & 0x7F00; if (nhandle != handle) { NL_SET_ERR_MSG_FMT(extack, "Handle mismatch constructed: %x (expected: %x)", handle, nhandle); return -EINVAL; } } if (!nhandle) { NL_SET_ERR_MSG(extack, "Replacing with handle of 0 is invalid"); return -EINVAL; } h1 = to_hash(nhandle); b = rtnl_dereference(head->table[h1]); if (!b) { b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); if (b == NULL) return -ENOBUFS; rcu_assign_pointer(head->table[h1], b); } else { unsigned int h2 = from_hash(nhandle >> 16); for (fp = rtnl_dereference(b->ht[h2]); fp; fp = rtnl_dereference(fp->next)) if (fp->handle == f->handle) return -EEXIST; } if (tb[TCA_ROUTE4_TO]) f->id = to; if (tb[TCA_ROUTE4_FROM]) f->id = to | id<<16; else if (tb[TCA_ROUTE4_IIF]) f->iif = id; f->handle = nhandle; f->bkt = b; f->tp = tp; if (tb[TCA_ROUTE4_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); tcf_bind_filter(tp, &f->res, base); } return 0; } static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; struct route4_filter *fold, *f1, *pfp, *f = NULL; struct route4_bucket *b; struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; int err; bool new = true; if (!handle) { NL_SET_ERR_MSG(extack, "Creating with handle of 0 is invalid"); return -EINVAL; } if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) { NL_SET_ERR_MSG_MOD(extack, "Missing options"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, tca[TCA_OPTIONS], route4_policy, NULL); if (err < 0) return err; fold = *arg; if (fold && fold->handle != handle) return -EINVAL; err = -ENOBUFS; f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); if (!f) goto errout; err = tcf_exts_init(&f->exts, net, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); if (err < 0) goto errout; if (fold) { f->id = fold->id; f->iif = fold->iif; f->handle = fold->handle; f->tp = fold->tp; f->bkt = fold->bkt; new = false; } err = route4_set_parms(net, tp, base, f, handle, head, tb, tca[TCA_RATE], new, flags, extack); if (err < 0) goto errout; h = from_hash(f->handle >> 16); fp = &f->bkt->ht[h]; for (pfp = rtnl_dereference(*fp); (f1 = rtnl_dereference(*fp)) != NULL; fp = &f1->next) if (f->handle < f1->handle) break; tcf_block_netif_keep_dst(tp->chain->block); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); if (fold) { th = to_hash(fold->handle); h = from_hash(fold->handle >> 16); b = rtnl_dereference(head->table[th]); if (b) { fp = &b->ht[h]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { if (pfp == fold) { rcu_assign_pointer(*fp, fold->next); break; } } } } route4_reset_fastmap(head); *arg = f; if (fold) { tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); tcf_queue_work(&fold->rwork, route4_delete_filter_work); } return 0; errout: if (f) tcf_exts_destroy(&f->exts); kfree(f); return err; } static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct route4_head *head = rtnl_dereference(tp->root); unsigned int h, h1; if (head == NULL || arg->stop) return; for (h = 0; h <= 256; h++) { struct route4_bucket *b = rtnl_dereference(head->table[h]); if (b) { for (h1 = 0; h1 <= 32; h1++) { struct route4_filter *f; for (f = rtnl_dereference(b->ht[h1]); f; f = rtnl_dereference(f->next)) { if (!tc_cls_stats_dump(tp, arg, f)) return; } } } } } static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct route4_filter *f = fh; struct nlattr *nest; u32 id; if (f == NULL) return skb->len; t->tcm_handle = f->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (!(f->handle & 0x8000)) { id = f->id & 0xFF; if (nla_put_u32(skb, TCA_ROUTE4_TO, id)) goto nla_put_failure; } if (f->handle & 0x80000000) { if ((f->handle >> 16) != 0xFFFF && nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif)) goto nla_put_failure; } else { id = f->id >> 16; if (nla_put_u32(skb, TCA_ROUTE4_FROM, id)) goto nla_put_failure; } if (f->res.classid && nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct route4_filter *f = fh; tc_cls_bind_class(classid, cl, q, &f->res, base); } static struct tcf_proto_ops cls_route4_ops __read_mostly = { .kind = "route", .classify = route4_classify, .init = route4_init, .destroy = route4_destroy, .get = route4_get, .change = route4_change, .delete = route4_delete, .walk = route4_walk, .dump = route4_dump, .bind_class = route4_bind_class, .owner = THIS_MODULE, }; static int __init init_route4(void) { return register_tcf_proto_ops(&cls_route4_ops); } static void __exit exit_route4(void) { unregister_tcf_proto_ops(&cls_route4_ops); } module_init(init_route4) module_exit(exit_route4) MODULE_DESCRIPTION("Routing table realm based TC classifier"); MODULE_LICENSE("GPL");
2 3 3 5 5 5 5 2 3 5 5 5 3 3 3 3 3 3 1 3 3 9 9 9 3 2 3 9 5 5 5 2 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> /* bus for j1939 remote devices * Since rtnetlink, no real bus is used. */ #include <net/sock.h> #include "j1939-priv.h" static void __j1939_ecu_release(struct kref *kref) { struct j1939_ecu *ecu = container_of(kref, struct j1939_ecu, kref); struct j1939_priv *priv = ecu->priv; list_del(&ecu->list); kfree(ecu); j1939_priv_put(priv); } void j1939_ecu_put(struct j1939_ecu *ecu) { kref_put(&ecu->kref, __j1939_ecu_release); } static void j1939_ecu_get(struct j1939_ecu *ecu) { kref_get(&ecu->kref); } static bool j1939_ecu_is_mapped_locked(struct j1939_ecu *ecu) { struct j1939_priv *priv = ecu->priv; lockdep_assert_held(&priv->lock); return j1939_ecu_find_by_addr_locked(priv, ecu->addr) == ecu; } /* ECU device interface */ /* map ECU to a bus address space */ static void j1939_ecu_map_locked(struct j1939_ecu *ecu) { struct j1939_priv *priv = ecu->priv; struct j1939_addr_ent *ent; lockdep_assert_held(&priv->lock); if (!j1939_address_is_unicast(ecu->addr)) return; ent = &priv->ents[ecu->addr]; if (ent->ecu) { netdev_warn(priv->ndev, "Trying to map already mapped ECU, addr: 0x%02x, name: 0x%016llx. Skip it.\n", ecu->addr, ecu->name); return; } j1939_ecu_get(ecu); ent->ecu = ecu; ent->nusers += ecu->nusers; } /* unmap ECU from a bus address space */ void j1939_ecu_unmap_locked(struct j1939_ecu *ecu) { struct j1939_priv *priv = ecu->priv; struct j1939_addr_ent *ent; lockdep_assert_held(&priv->lock); if (!j1939_address_is_unicast(ecu->addr)) return; if (!j1939_ecu_is_mapped_locked(ecu)) return; ent = &priv->ents[ecu->addr]; ent->ecu = NULL; ent->nusers -= ecu->nusers; j1939_ecu_put(ecu); } void j1939_ecu_unmap(struct j1939_ecu *ecu) { write_lock_bh(&ecu->priv->lock); j1939_ecu_unmap_locked(ecu); write_unlock_bh(&ecu->priv->lock); } void j1939_ecu_unmap_all(struct j1939_priv *priv) { int i; write_lock_bh(&priv->lock); for (i = 0; i < ARRAY_SIZE(priv->ents); i++) if (priv->ents[i].ecu) j1939_ecu_unmap_locked(priv->ents[i].ecu); write_unlock_bh(&priv->lock); } void j1939_ecu_timer_start(struct j1939_ecu *ecu) { /* The ECU is held here and released in the * j1939_ecu_timer_handler() or j1939_ecu_timer_cancel(). */ j1939_ecu_get(ecu); /* Schedule timer in 250 msec to commit address change. */ hrtimer_start(&ecu->ac_timer, ms_to_ktime(250), HRTIMER_MODE_REL_SOFT); } void j1939_ecu_timer_cancel(struct j1939_ecu *ecu) { if (hrtimer_cancel(&ecu->ac_timer)) j1939_ecu_put(ecu); } static enum hrtimer_restart j1939_ecu_timer_handler(struct hrtimer *hrtimer) { struct j1939_ecu *ecu = container_of(hrtimer, struct j1939_ecu, ac_timer); struct j1939_priv *priv = ecu->priv; write_lock_bh(&priv->lock); /* TODO: can we test if ecu->addr is unicast before starting * the timer? */ j1939_ecu_map_locked(ecu); /* The corresponding j1939_ecu_get() is in * j1939_ecu_timer_start(). */ j1939_ecu_put(ecu); write_unlock_bh(&priv->lock); return HRTIMER_NORESTART; } struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); ecu = kzalloc(sizeof(*ecu), gfp_any()); if (!ecu) return ERR_PTR(-ENOMEM); kref_init(&ecu->kref); ecu->addr = J1939_IDLE_ADDR; ecu->name = name; hrtimer_init(&ecu->ac_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); ecu->ac_timer.function = j1939_ecu_timer_handler; INIT_LIST_HEAD(&ecu->list); j1939_priv_get(priv); ecu->priv = priv; list_add_tail(&ecu->list, &priv->ecus); return ecu; } struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv, u8 addr) { lockdep_assert_held(&priv->lock); return priv->ents[addr].ecu; } struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, u8 addr) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); if (!j1939_address_is_unicast(addr)) return NULL; ecu = j1939_ecu_find_by_addr_locked(priv, addr); if (ecu) j1939_ecu_get(ecu); return ecu; } struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr) { struct j1939_ecu *ecu; read_lock_bh(&priv->lock); ecu = j1939_ecu_get_by_addr_locked(priv, addr); read_unlock_bh(&priv->lock); return ecu; } /* get pointer to ecu without increasing ref counter */ static struct j1939_ecu *j1939_ecu_find_by_name_locked(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); list_for_each_entry(ecu, &priv->ecus, list) { if (ecu->name == name) return ecu; } return NULL; } struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); if (!name) return NULL; ecu = j1939_ecu_find_by_name_locked(priv, name); if (ecu) j1939_ecu_get(ecu); return ecu; } struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; read_lock_bh(&priv->lock); ecu = j1939_ecu_get_by_name_locked(priv, name); read_unlock_bh(&priv->lock); return ecu; } u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; int addr = J1939_IDLE_ADDR; if (!name) return J1939_NO_ADDR; read_lock_bh(&priv->lock); ecu = j1939_ecu_find_by_name_locked(priv, name); if (ecu && j1939_ecu_is_mapped_locked(ecu)) /* ecu's SA is registered */ addr = ecu->addr; read_unlock_bh(&priv->lock); return addr; } /* TX addr/name accounting * Transport protocol needs to know if a SA is local or not * These functions originate from userspace manipulating sockets, * so locking is straigforward */ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) { struct j1939_ecu *ecu; int err = 0; write_lock_bh(&priv->lock); if (j1939_address_is_unicast(sa)) priv->ents[sa].nusers++; if (!name) goto done; ecu = j1939_ecu_get_by_name_locked(priv, name); if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); if (err) goto done; ecu->nusers++; /* TODO: do we care if ecu->addr != sa? */ if (j1939_ecu_is_mapped_locked(ecu)) /* ecu's sa is active already */ priv->ents[ecu->addr].nusers++; done: write_unlock_bh(&priv->lock); return err; } void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa) { struct j1939_ecu *ecu; write_lock_bh(&priv->lock); if (j1939_address_is_unicast(sa)) priv->ents[sa].nusers--; if (!name) goto done; ecu = j1939_ecu_find_by_name_locked(priv, name); if (WARN_ON_ONCE(!ecu)) goto done; ecu->nusers--; /* TODO: do we care if ecu->addr != sa? */ if (j1939_ecu_is_mapped_locked(ecu)) /* ecu's sa is active already */ priv->ents[ecu->addr].nusers--; j1939_ecu_put(ecu); done: write_unlock_bh(&priv->lock); }
5 3559 989 779 782 762 2392 5 3072 208 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. */ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained. * * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. */ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline int ext4_es_is_delonly(struct extent_status *es) { return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */
487 157 382 379 49 331 331 331 331 331 331 331 55 55 11 10 55 55 12 55 19 55 26 55 42 42 121 34 86 84 84 86 34 86 57 57 5 2 1 4 80 58 85 85 86 84 84 84 84 84 373 374 1 372 376 381 368 369 9 9 8 8 18 378 483 485 377 374 378 376 375 377 1 378 376 377 382 380 6 1 6 380 5 4 1 5 1 5 196 73 485 484 485 484 483 146 148 148 90 89 1 90 89 90 90 90 89 28 96 1 1 1 1 1 90 89 89 89 89 37 90 88 5 2 5 5 5 5 5 1 5 5 5 89 89 1 89 1 38 89 6 6 1 6 6 6 7 2 8 4 4 4 8 72 73 11 53 189 378 189 13 190 14 1 14 14 147 190 56 379 376 378 377 382 378 376 376 364 378 378 376 2 2 1 1 1 1 1 1 1 1 1 22 1 1 1 1 1 96 60 97 83 83 82 83 48 30 26 48 18 8 48 13 13 13 13 2 328 253 252 252 333 330 1 329 120 329 329 1 324 195 332 29 5 5 5 1 1 1 276 277 276 126 48 86 49 55 334 125 180 136 11 46 9 38 24 1 483 53 51 15 13 50 25 50 22 21 481 20 21 21 21 483 483 482 324 4 2 477 5 4 481 134 135 118 18 18 136 136 365 7 7 7 7 364 46 24 28 478 35 479 1 484 476 486 10 484 486 136 118 115 382 480 1 481 12 9 9 2 2 9 9 9 15 10 2 2 9 4 9 5 8 8 8 6 6 8 2 2 3 1 1 2 1 2 1 1 1 1 1 326 7 328 15 15 15 15 49 15 43 48 37 49 39 39 39 47 1 47 42 47 42 47 26 270 270 270 259 259 270 270 270 271 270 270 271 5 3 2 1 18 4 3 1 1 507 506 504 2 2 502 499 501 6 7 5 7 4 7 494 488 25 8 8 15 33 23 5 3 466 16 2 4 1 498 493 732 328 326 2 326 200 326 326 326 326 313 312 313 344 338 3 337 336 14 11 2 1 9 323 12 331 31 331 321 331 13 331 1 331 40 329 293 169 2 326 313 187 313 313 313 313 14 313 312 20 349 1 349 347 346 17 349 349 349 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 // SPDX-License-Identifier: GPL-2.0 /* * Shared application/kernel submission and completion ring pairs, for * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between * the application and kernel side. * * After the application reads the CQ ring tail, it must use an * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses * before writing the tail (using smp_load_acquire to read the tail will * do). It also needs a smp_mb() before updating CQ head (ordering the * entry load(s) with the head store), pairing with an implicit barrier * through a control-dependency in io_get_cqe (smp_store_release to * store head will do). Failure to do so could lead to reading invalid * CQ entries. * * Likewise, the application must use an appropriate smp_wmb() before * writing the SQ tail (ordering SQ entry stores with the tail store), * which pairs with smp_load_acquire in io_get_sqring (smp_store_release * to store the tail will do). And it needs a barrier ordering the SQ * head load before writing new SQ entries (smp_load_acquire to read * head will do). * * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* * updating the SQ tail; a full memory barrier smp_mb() is needed * between. * * Also see the examples in the liburing library: * * git://git.kernel.dk/liburing * * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens * from data shared between the kernel and application. This is done both * for ordering purposes, but also to ensure that once a value is loaded from * data that the application could potentially modify, it remains stable. * * Copyright (C) 2018-2019 Jens Axboe * Copyright (c) 2018-2019 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <net/compat.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/bits.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> #include <net/af_unix.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include <linux/highmem.h> #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/task_work.h> #include <linux/io_uring.h> #include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io-wq.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" #include "tctx.h" #include "register.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" #include "cancel.h" #include "net.h" #include "notif.h" #include "waitid.h" #include "futex.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, }; struct io_defer_entry { struct list_head list; struct io_kiocb *req; u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); static void io_queue_sqe(struct io_kiocb *req); struct kmem_cache *req_cachep; static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL static struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, .maxlen = sizeof(sysctl_io_uring_disabled), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "io_uring_group", .data = &sysctl_io_uring_group, .maxlen = sizeof(gid_t), .mode = 0644, .proc_handler = proc_dointvec, }, {}, }; #endif static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || ctx->submit_state.cqes_count) __io_submit_flush_completions(ctx); } static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) { return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); } static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; } return false; } /* * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock. */ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all) { bool matched; if (task && head->task != task) return false; if (cancel_all) return true; if (head->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } return matched; } static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); io_req_set_res(req, res, 0); } static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); complete(&ctx->ref_comp); } static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; struct io_tw_state ts = { .locked = true, }; percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &ts); if (WARN_ON_ONCE(!ts.locked)) return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { unsigned hash_buckets = 1U << bits; size_t hash_size = hash_buckets * sizeof(table->hbs[0]); table->hbs = kmalloc(hash_size, GFP_KERNEL); if (!table->hbs) return -ENOMEM; table->hash_bits = bits; init_hash_table(table, hash_buckets); return 0; } static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; xa_init(&ctx->io_bl_xa); /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread, but * don't keep too many buckets to not overconsume memory. */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) goto err; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); INIT_HLIST_HEAD(&ctx->io_buf_list); io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, sizeof(struct async_poll)); io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); io_futex_cache_init(ctx); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); INIT_LIST_HEAD(&ctx->rsrc_ref_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_WQ_LIST(&ctx->locked_free_list); INIT_HLIST_HEAD(&ctx->waitid_list); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); return ctx; err: kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } static void io_account_cq_overflow(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); ctx->cq_extra--; } static bool req_need_defer(struct io_kiocb *req, u32 seq) { if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; } return false; } static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { spin_lock(&req->ctx->completion_lock); io_put_kbuf_comp(req); spin_unlock(&req->ctx->completion_lock); } if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; if (def->cleanup) def->cleanup(req); } if ((req->flags & REQ_F_POLLED) && req->apoll) { kfree(req->apoll->double_poll); kfree(req->apoll); req->apoll = NULL; } if (req->flags & REQ_F_INFLIGHT) { struct io_uring_task *tctx = req->task->io_uring; atomic_dec(&tctx->inflight_tracked); } if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { kfree(req->async_data); req->async_data = NULL; } req->flags &= ~IO_REQ_CLEAN_FLAGS; } static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; atomic_inc(&req->task->io_uring->inflight_tracked); } } static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) return NULL; req->flags &= ~REQ_F_ARM_LTIMEOUT; req->flags |= REQ_F_LINK_TIMEOUT; /* linked timeouts should have two refs once prep'ed */ io_req_set_refcount(req); __io_req_set_refcount(req->link, 2); return req->link; } static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) return NULL; return __io_prep_linked_timeout(req); } static noinline void __io_arm_ltimeout(struct io_kiocb *req) { io_queue_linked_timeout(__io_prep_linked_timeout(req)); } static inline void io_arm_ltimeout(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) __io_arm_ltimeout(req); } static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; req->creds = get_current_cred(); } req->work.list.next = NULL; req->work.flags = 0; req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; /* don't serialize this request if the fs doesn't need it */ if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) should_hash = false; if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } } static void io_prep_async_link(struct io_kiocb *req) { struct io_kiocb *cur; if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); } } void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; BUG_ON(!tctx); BUG_ON(!tctx->io_wq); /* init ->work of the whole link before punting */ io_prep_async_link(req); /* * Not expected to happen, but if we do have a bug where this _can_ * happen, catch it here and ensure the request is marked as * canceled. That will make io-wq go through the usual work cancel * procedure rather than attempt to run this request (or create a new * worker for it). */ if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); } static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); io_req_task_queue(de->req); kfree(de); } } void io_eventfd_ops(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); int ops = atomic_xchg(&ev_fd->ops, 0); if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback * ordering in a race but if references are 0 we know we have to free * it regardless. */ if (atomic_dec_and_test(&ev_fd->refs)) { eventfd_ctx_put(ev_fd->cq_ev_fd); kfree(ev_fd); } } static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd = NULL; rcu_read_lock(); /* * rcu_dereference ctx->io_ev_fd once and use it for both for checking * and eventfd_signal */ ev_fd = rcu_dereference(ctx->io_ev_fd); /* * Check again if ev_fd exists incase an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ if (unlikely(!ev_fd)) goto out; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) goto out; if (ev_fd->eventfd_async && !io_wq_current_is_worker()) goto out; if (likely(eventfd_signal_allowed())) { eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); else atomic_dec(&ev_fd->refs); } out: rcu_read_unlock(); } static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) { bool skip; spin_lock(&ctx->completion_lock); /* * Eventfd should only get triggered when at least one event has been * posted. Some applications rely on the eventfd notification count * only changing IFF a new CQE has been added to the CQ ring. There's * no depedency on 1:1 relationship between how many times this * function is called (and hence the eventfd count) and number of CQEs * posted to the CQ ring. */ skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; ctx->evfd_last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); if (skip) return; io_eventfd_signal(ctx); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if (ctx->drain_active) { spin_lock(&ctx->completion_lock); io_queue_deferred(ctx); spin_unlock(&ctx->completion_lock); } if (ctx->has_evfd) io_eventfd_flush_signal(ctx); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { if (!ctx->lockless_cq) spin_lock(&ctx->completion_lock); } static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { spin_lock(&ctx->completion_lock); } static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); if (!ctx->task_complete) { if (!ctx->lockless_cq) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ if (!ctx->syscall_iopoll) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); } static void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_wake(ctx); io_commit_cqring_flush(ctx); } /* Returns true if there are no backlogged entries after the flush */ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { struct io_overflow_cqe *ocqe; LIST_HEAD(list); spin_lock(&ctx->completion_lock); list_splice_init(&ctx->cq_overflow_list, &list); clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); spin_unlock(&ctx->completion_lock); while (!list_empty(&list)) { ocqe = list_first_entry(&list, struct io_overflow_cqe, list); list_del(&ocqe->list); kfree(ocqe); } } static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { size_t cqe_size = sizeof(struct io_uring_cqe); if (__io_cqring_events(ctx) == ctx->cq_entries) return; if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; if (!io_get_cqe_overflow(ctx, &cqe, true)) break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); memcpy(cqe, &ocqe->cqe, cqe_size); list_del(&ocqe->list); kfree(ocqe); } if (list_empty(&ctx->cq_overflow_list)) { clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } io_cq_unlock_post(ctx); } static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); __io_cqring_overflow_flush(ctx); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) { if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) io_cqring_do_overflow_flush(ctx); } /* can be called by any task */ static void io_put_task_remote(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; percpu_counter_sub(&tctx->inflight, 1); if (unlikely(atomic_read(&tctx->in_cancel))) wake_up(&tctx->wait); put_task_struct(task); } /* used by a task to put its own references */ static void io_put_task_local(struct task_struct *task) { task->io_uring->cached_refs++; } /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct task_struct *task) { if (likely(task == current)) io_put_task_local(task); else io_put_task_remote(task); } void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; percpu_counter_add(&tctx->inflight, refill); refcount_add(refill, &current->usage); tctx->cached_refs += refill; } static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; if (refs) { tctx->cached_refs = 0; percpu_counter_sub(&tctx->inflight, refs); put_task_struct_many(task, refs); } } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, u64 extra1, u64 extra2) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); lockdep_assert_held(&ctx->completion_lock); if (is_cqe32) ocq_size += sizeof(struct io_uring_cqe); ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ io_account_cq_overflow(ctx); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } if (list_empty(&ctx->cq_overflow_list)) { set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; ocqe->cqe.flags = cflags; if (is_cqe32) { ocqe->cqe.big_cqe[0] = extra1; ocqe->cqe.big_cqe[1] = extra2; } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } void io_req_cqe_overflow(struct io_kiocb *req) { io_cqring_event_overflow(req->ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags, req->big_cqe.extra1, req->big_cqe.extra2); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); unsigned int free, queued, len; /* * Posting into the CQ when there are pending overflowed CQEs may break * ordering guarantees, which will affect links, F_MORE users and more. * Force overflow the completion. */ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (!len) return false; if (ctx->flags & IORING_SETUP_CQE32) { off <<= 1; len <<= 1; } ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; return true; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; ctx->cq_extra++; /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in * the ring. */ if (likely(io_get_cqe(ctx, &cqe))) { trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); if (ctx->flags & IORING_SETUP_CQE32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } return true; } return false; } static void __io_flush_post_cqes(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; unsigned int i; lockdep_assert_held(&ctx->uring_lock); for (i = 0; i < state->cqes_count; i++) { struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_cqring_event_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, 0, 0); spin_unlock(&ctx->completion_lock); } else { io_cqring_event_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, 0, 0); } } } state->cqes_count = 0; } static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled && allow_overflow) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); io_cq_unlock_post(ctx); return filled; } bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; if (!defer) return __io_post_aux_cqe(ctx, user_data, res, cflags, false); lockdep_assert_held(&ctx->uring_lock); if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { __io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ __io_cq_unlock_post(ctx); } /* For defered completions this is not as strict as it is otherwise, * however it's main job is to prevent unbounded posted completions, * and in that it works just as well. */ if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) return false; cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; cqe->user_data = user_data; cqe->res = res; cqe->flags = cflags; return true; } static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *rsrc_node = NULL; io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) { if (!io_fill_cqe_req(ctx, req)) io_req_cqe_overflow(req); } /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); req->link = NULL; } } io_put_kbuf_comp(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); io_put_file(req); rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. */ io_put_task_remote(req->task); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } io_cq_unlock_post(ctx); if (rsrc_node) { io_ring_submit_lock(ctx, issue_flags); io_put_rsrc_node(ctx, rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { if (req->ctx->task_complete && req->ctx->submitter_task != current) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || !(req->ctx->flags & IORING_SETUP_IOPOLL)) { __io_req_complete_post(req, issue_flags); } else { struct io_ring_ctx *ctx = req->ctx; mutex_lock(&ctx->uring_lock); __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); mutex_unlock(&ctx->uring_lock); } } void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); io_req_complete_defer(req); } /* * Don't initialise the fields below on every allocation, but do that in * advance and keep them valid across allocations. */ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) { req->ctx = ctx; req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ memset(&req->cqe, 0, sizeof(req->cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { spin_lock(&ctx->completion_lock); wq_list_splice(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock(&ctx->completion_lock); } /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; int ret, i; /* * If we have more than a batch's worth of requests in our IRQ side * locked cache, grab the lock and move them over to our submission * side cache. */ if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { io_flush_cached_locked_reqs(ctx, &ctx->submit_state); if (!io_req_cache_empty(ctx)) return true; } ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, * retry single alloc to be on the safe side. */ if (unlikely(ret <= 0)) { reqs[0] = kmem_cache_alloc(req_cachep, gfp); if (!reqs[0]) return false; ret = 1; } percpu_ref_get_many(&ctx->refs, ret); for (i = 0; i < ret; i++) { struct io_kiocb *req = reqs[i]; io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; } __cold void io_free_req(struct io_kiocb *req) { /* refs were already put, restore them for io_req_task_complete() */ req->flags &= ~REQ_F_REFCOUNT; /* we only want to free it, don't post CQEs */ req->flags |= REQ_F_CQE_SKIP; req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); io_disarm_next(req); spin_unlock(&ctx->completion_lock); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ if (unlikely(req->flags & IO_DISARM_MASK)) __io_req_find_next_prep(req); nxt = req->link; req->link = NULL; return nxt; } static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ts->locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); ts->locked = false; } percpu_ref_put(&ctx->refs); } static unsigned int handle_tw_list(struct llist_node *node, struct io_ring_ctx **ctx, struct io_tw_state *ts, struct llist_node *last) { unsigned int count = 0; while (node && node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); if (req->ctx != *ctx) { ctx_flush_and_put(*ctx, ts); *ctx = req->ctx; /* if not contended, grab and improve batching */ ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, req, ts); node = next; count++; if (unlikely(need_resched())) { ctx_flush_and_put(*ctx, ts); *ctx = NULL; cond_resched(); } } return count; } /** * io_llist_xchg - swap all entries in a lock-less list * @head: the head of lock-less list to delete all entries * @new: new entry as the head of the list * * If list is empty, return NULL, otherwise, return the pointer to the first entry. * The order of entries returned is from the newest to the oldest added one. */ static inline struct llist_node *io_llist_xchg(struct llist_head *head, struct llist_node *new) { return xchg(&head->first, new); } /** * io_llist_cmpxchg - possibly swap all entries in a lock-less list * @head: the head of lock-less list to delete all entries * @old: expected old value of the first entry of the list * @new: new entry as the head of the list * * perform a cmpxchg on the first entry of the list. */ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, struct llist_node *old, struct llist_node *new) { return cmpxchg(&head->first, old, new); } static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) { struct llist_node *node = llist_del_all(&tctx->task_list); struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; if (sync && last_ctx != req->ctx) { if (last_ctx) { flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } if (llist_add(&req->io_task_work.node, &req->ctx->fallback_llist)) schedule_delayed_work(&req->ctx->fallback_work, 1); } if (last_ctx) { flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } void tctx_task_work(struct callback_head *cb) { struct io_tw_state ts = {}; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); struct llist_node fake = {}; struct llist_node *node; unsigned int loops = 0; unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { io_fallback_tw(tctx, true); return; } do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); count += handle_tw_list(node, &ctx, &ts, &fake); /* skip expensive cmpxchg if there are items in the list */ if (READ_ONCE(tctx->task_list.first) != &fake) continue; if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { io_submit_flush_completions(ctx); if (READ_ONCE(tctx->task_list.first) != &fake) continue; } node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } while (node != &fake); ctx_flush_and_put(ctx, &ts); /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) io_uring_drop_tctx_refs(current); trace_io_uring_task_work_run(tctx, count, loops); } static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *first; if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; first = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; if (first) { struct io_kiocb *first_req = container_of(first, struct io_kiocb, io_task_work.node); /* * Might be executed at any moment, rely on * SLAB_TYPESAFE_BY_RCU to keep it alive. */ nr_tw_prev = READ_ONCE(first_req->nr_tw); } nr_tw = nr_tw_prev + 1; /* Large enough to fail the nr_wait comparison below */ if (!(flags & IOU_F_TWQ_LAZY_WAKE)) nr_tw = -1U; req->nr_tw = nr_tw; req->io_task_work.node.next = first; } while (!try_cmpxchg(&ctx->work_llist.first, &first, &req->io_task_work.node)); if (!first) { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) io_eventfd_signal(ctx); } nr_wait = atomic_read(&ctx->cq_wait_nr); /* no one is waiting */ if (!nr_wait) return; /* either not enough or the previous add has already woken it up */ if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) return; /* pairs with set_current_state() in io_cqring_wait() */ smp_mb__after_atomic(); wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } static void io_req_normal_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; io_fallback_tw(tctx, false); } void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { rcu_read_lock(); io_req_local_work_add(req, flags); rcu_read_unlock(); } else { io_req_normal_work_add(req); } } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; node = llist_del_all(&ctx->work_llist); while (node) { struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; io_req_normal_work_add(req); } } static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) { struct llist_node *node; unsigned int loops = 0; int ret = 0; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: /* * llists are in reverse order, flip it back the right way before * running the pending items. */ node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, req, ts); ret++; node = next; } loops++; if (!llist_empty(&ctx->work_llist)) goto again; if (ts->locked) { io_submit_flush_completions(ctx); if (!llist_empty(&ctx->work_llist)) goto again; } trace_io_uring_local_work_run(ctx, ret, loops); return ret; } static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { struct io_tw_state ts = { .locked = true, }; int ret; if (llist_empty(&ctx->work_llist)) return 0; ret = __io_run_local_work(ctx, &ts); /* shouldn't happen! */ if (WARN_ON_ONCE(!ts.locked)) mutex_lock(&ctx->uring_lock); return ret; } static int io_run_local_work(struct io_ring_ctx *ctx) { struct io_tw_state ts = {}; int ret; ts.locked = mutex_trylock(&ctx->uring_lock); ret = __io_run_local_work(ctx, &ts); if (ts.locked) mutex_unlock(&ctx->uring_lock); return ret; } static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); io_req_defer_failed(req, req->cqe.res); } void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req, ts); else io_queue_sqe(req); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) { io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; io_req_task_work_add(req); } void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; io_req_task_work_add(req); } void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); if (nxt) io_req_task_queue(nxt); } static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) continue; } if ((req->flags & REQ_F_POLLED) && req->apoll) { struct async_poll *apoll = req->apoll; if (apoll->double_poll) kfree(apoll->double_poll); if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) kfree(apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) io_queue_next(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } io_put_file(req); io_req_put_rsrc_locked(req, ctx); io_put_task(req->task); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); } void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; struct io_wq_work_node *node; __io_cq_lock(ctx); /* must come first to preserve CQE ordering in failure cases */ if (state->cqes_count) __io_flush_post_cqes(ctx); __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); if (!(req->flags & REQ_F_CQE_SKIP) && unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_req_cqe_overflow(req); spin_unlock(&ctx->completion_lock); } else { io_req_cqe_overflow(req); } } } __io_cq_unlock_post(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } } static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ smp_rmb(); return __io_cqring_events(ctx); } /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); while (!wq_list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ if (io_do_iopoll(ctx, true) == 0) break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. * Also let task_work, etc. to progress by releasing the mutex */ if (need_resched()) { mutex_unlock(&ctx->uring_lock); cond_resched(); mutex_lock(&ctx->uring_lock); } } mutex_unlock(&ctx->uring_lock); } static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; unsigned long check_cq; if (!io_allowed_run_tw(ctx)) return -EEXIST; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) __io_cqring_overflow_flush(ctx); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. */ if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) return -EBADR; } /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ if (io_cqring_events(ctx)) return 0; do { int ret = 0; /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets * issued. That app will hold the uring_lock for the duration * of the poll right here, so we need to take a breather every * now and then to ensure that the issue has a chance to add * the poll to the issued list. Otherwise we can spin here * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ if (wq_list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; (void) io_run_local_work_locked(ctx); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); } /* some requests don't go through iopoll_list */ if (tail != ctx->cached_cq_tail || wq_list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, !min); if (unlikely(ret < 0)) return ret; if (task_sigpending(current)) return -EINTR; if (need_resched()) break; nr_events += ret; } while (nr_events < min); return 0; } void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { if (ts->locked) io_req_complete_defer(req); else io_req_complete_post(req, IO_URING_F_UNLOCKED); } /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; /* workqueue context doesn't hold uring_lock, grab it now */ if (unlikely(needs_lock)) mutex_lock(&ctx->uring_lock); /* * Track whether we have multiple files in our lists. This will impact * how we do polling eventually, not spinning if we're on potentially * different devices. */ if (wq_list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, comp_list); if (list_req->file != req->file) ctx->poll_multi_queue = true; } /* * For fast devices, IO may have already completed. If it has, add * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) wq_list_add_head(&req->comp_list, &ctx->iopoll_list); else wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); if (unlikely(needs_lock)) { /* * If IORING_SETUP_SQPOLL is enabled, sqes are either handle * in sq thread task context or in io worker task context. If * current task context is sq thread, we don't need to check * whether should wake up sq thread. */ if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); mutex_unlock(&ctx->uring_lock); } } unsigned int io_file_get_flags(struct file *file) { unsigned int res = 0; if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) res |= REQ_F_SUPPORT_NOWAIT; return res; } bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; } return true; } int io_req_prep_async(struct io_kiocb *req) { const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; const struct io_issue_def *def = &io_issue_defs[req->opcode]; /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) req->file = io_file_get_normal(req, req->cqe.fd); if (!cdef->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (!def->manual_alloc) { if (io_alloc_async_data(req)) return -EAGAIN; } return cdef->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; struct io_kiocb *cur; /* need original cached_sq_head, but it was increased for each req */ io_for_each_link(cur, req) seq--; return seq; } static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. */ spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); queue: ctx->drain_active = false; io_req_task_queue(req); return; } spin_unlock(&ctx->completion_lock); io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { ret = -ENOMEM; io_req_defer_failed(req, ret); return; } spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); goto queue; } trace_io_uring_defer(req); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock(&ctx->completion_lock); } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, unsigned int issue_flags) { if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else req->file = io_file_get_normal(req, req->cqe.fd); return !!req->file; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; if (unlikely(!io_assign_file(req, def, issue_flags))) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); if (!def->audit_skip) audit_uring_entry(req->opcode); ret = def->issue(req, issue_flags); if (!def->audit_skip) audit_uring_exit(!ret, ret); if (creds) revert_creds(creds); if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else io_req_complete_post(req, issue_flags); return 0; } if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; io_arm_ltimeout(req); /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) io_iopoll_req_issued(req, issue_flags); } return ret; } int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; if (req_ref_put_and_test(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); } return nxt ? &nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else req_ref_get(req); io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; work->flags |= IO_WQ_WORK_CANCEL; goto fail; } if (req->flags & REQ_F_FORCE_ASYNC) { bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && file_can_poll(req->file)) { needs_poll = true; issue_flags |= IO_URING_F_NONBLOCK; } } do { ret = io_issue_sqe(req, issue_flags); if (ret != -EAGAIN) break; /* * If REQ_F_NOWAIT is set, then don't wait or retry with * poll. -EAGAIN is final for that case. */ if (req->flags & REQ_F_NOWAIT) break; /* * We can get EAGAIN for iopolled IO even though we're * forcing a sync submission from here, since we can't * wait for request slots on the block side. */ if (!needs_poll) { if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) break; if (io_wq_worker_stopped()) break; cond_resched(); continue; } if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) return; /* aborted or ready, in either case retry blocking */ needs_poll = false; issue_flags &= ~IO_URING_F_NONBLOCK; } while (1); /* avoid locking problems by failing it from a clean context */ if (ret < 0) io_req_task_queue_fail(req, ret); } inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_fixed_file *slot; struct file *file = NULL; io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; fd = array_index_nospec(fd, ctx->nr_user_files); slot = io_fixed_file_slot(&ctx->file_table, fd); file = io_slot_file(slot); req->flags |= io_slot_flags(slot); io_req_set_rsrc_node(req, ctx, 0); out: io_ring_submit_unlock(ctx, issue_flags); return file; } struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); trace_io_uring_file_get(req, fd); /* we don't allow fixed io_uring files */ if (file && io_is_uring_fops(file)) io_req_track_inflight(req); return file; } static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { struct io_kiocb *linked_timeout; if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { io_req_defer_failed(req, ret); return; } linked_timeout = io_prep_linked_timeout(req); switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: io_kbuf_recycle(req, 0); io_queue_iowq(req, NULL); break; case IO_APOLL_OK: break; } if (linked_timeout) io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { int ret; ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (unlikely(ret)) io_queue_async(req, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { if (unlikely(req->flags & REQ_F_FAIL)) { /* * We don't submit, fail them all, for that replace hardlinks * with normal links. Extra REQ_F_LINK is tolerated. */ req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) { io_req_defer_failed(req, ret); return; } if (unlikely(req->ctx->drain_active)) io_drain_req(req); else io_queue_iowq(req, NULL); } } /* * Check SQE restrictions (opcode and flags). * * Returns 'true' if SQE is allowed, 'false' otherwise. */ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; if ((sqe_flags & ctx->restrictions.sqe_flags_required) != ctx->restrictions.sqe_flags_required) return false; if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | ctx->restrictions.sqe_flags_required)) return false; return true; } static void io_init_req_drain(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; if (head) { /* * If we need to drain a request in the middle of a link, drain * the head request and the next request/link after the current * link. Considering sequential execution of links, * REQ_F_IO_DRAIN will be maintained for every request of our * link. */ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; ctx->drain_next = true; } } static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->rsrc_node = NULL; req->task = current; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; return -EINVAL; } def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; if (sqe_flags & IOSQE_BUFFER_SELECT) { if (!def->buffer_select) return -EOPNOTSUPP; req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) return -EOPNOTSUPP; io_init_req_drain(req); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) return -EACCES; /* knock it to the slow queue path, will be drained there */ if (ctx->drain_active) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { ctx->drain_next = false; ctx->drain_active = true; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } if (!def->ioprio && sqe->ioprio) return -EINVAL; if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; req->cqe.fd = READ_ONCE(sqe->fd); /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. */ if (state->need_plug && def->plug) { state->plug_started = true; state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } } personality = READ_ONCE(sqe->personality); if (personality) { int ret; req->creds = xa_load(&ctx->personalities, personality); if (!req->creds) return -EINVAL; get_cred(req->creds); ret = security_uring_override_creds(req->creds); if (ret) { put_cred(req->creds); return ret; } req->flags |= REQ_F_CREDS; } return def->prep(req, sqe); } static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, struct io_kiocb *req, int ret) { struct io_ring_ctx *ctx = req->ctx; struct io_submit_link *link = &ctx->submit_state.link; struct io_kiocb *head = link->head; trace_io_uring_req_failed(sqe, req, ret); /* * Avoid breaking links in the middle as it renders links with SQPOLL * unusable. Instead of failing eagerly, continue assembling the link if * applicable and mark the head with REQ_F_FAIL. The link flushing code * should find the flag and handle the rest. */ req_fail_link_node(req, ret); if (head && !(head->flags & REQ_F_FAIL)) req_fail_link_node(head, -ECANCELED); if (!(req->flags & IO_REQ_LINK_FLAGS)) { if (head) { link->last->link = req; link->head = NULL; req = head; } io_queue_sqe_fallback(req); return ret; } if (head) link->last->link = req; else link->head = req; link->last = req; return 0; } static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ if (unlikely(link->head)) { ret = io_req_prep_async(req); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; if (req->flags & IO_REQ_LINK_FLAGS) return 0; /* last request of the link, flush it */ req = link->head; link->head = NULL; if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) goto fallback; } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { if (req->flags & IO_REQ_LINK_FLAGS) { link->head = req; link->last = req; } else { fallback: io_queue_sqe_fallback(req); } return 0; } io_queue_sqe(req); return 0; } /* * Batched submission is done, ensure local IO is flushed out. */ static void io_submit_state_end(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; if (unlikely(state->link.head)) io_queue_sqe_fallback(state->link.head); /* flush only after queuing links as they can generate completions */ io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); } /* * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { state->plug_started = false; state->need_plug = max_ios > 2; state->submit_nr = max_ios; /* set only head, no need to init link_last in advance */ state->link.head = NULL; } static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; /* * Ensure any loads from the SQEs are done at this point, * since once we write the new head, the application could * write new data to them. */ smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* * Fetch an sqe, if one is available. Note this returns a pointer to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { /* drop invalid entries */ spin_lock(&ctx->completion_lock); ctx->cq_extra--; spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; } } /* * The cached sq head (or cq tail) serves two purposes: * * 1) allows us to batch the cost of updating the user visible * head updates. * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ /* double index for 128-byte SQEs, twice as long */ if (ctx->flags & IORING_SETUP_SQE128) head <<= 1; *sqe = &ctx->sq_sqes[head]; return true; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); unsigned int left; int ret; if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ ret = left = min(nr, entries); io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); do { const struct io_uring_sqe *sqe; struct io_kiocb *req; if (unlikely(!io_alloc_req(ctx, &req))) break; if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } /* * Continue submitting even for sqe failure if the * ring was setup with IORING_SETUP_SUBMIT_ALL */ if (unlikely(io_submit_sqe(ctx, req, sqe)) && !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { left--; break; } } while (--left); if (unlikely(left)) { ret -= left; /* try again if it submitted nothing and can't allocate a req */ if (!ret && io_req_cache_empty(ctx)) ret = -EAGAIN; current->io_uring->cached_refs += left; } io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); return ret; } struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; ktime_t timeout; }; static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || !llist_empty(&ctx->work_llist); } static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (!llist_empty(&ctx->work_llist)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx) > 0) return 0; } if (io_run_task_work() > 0) return 0; if (task_sigpending(current)) return -EINTR; return 0; } static bool current_pending_io(void) { struct io_uring_task *tctx = current->io_uring; if (!tctx) return false; return percpu_counter_read_positive(&tctx->inflight); } /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { int io_wait, ret; if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(!llist_empty(&ctx->work_llist))) return 1; if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) return 1; if (unlikely(task_sigpending(current))) return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; /* * Mark us as being in io_wait if we have pending requests, so cpufreq * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ io_wait = current->in_iowait; if (current_pending_io()) current->in_iowait = 1; ret = 0; if (iowq->timeout == KTIME_MAX) schedule(); else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) ret = -ETIME; current->in_iowait = io_wait; return ret; } /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz, struct __kernel_timespec __user *uts) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; int ret; if (!io_allowed_run_tw(ctx)) return -EEXIST; if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx); io_run_task_work(); io_cqring_overflow_flush(ctx); /* if user messes with these they will just get an early return */ if (__io_cqring_events_user(ctx) >= min_events) return 0; if (sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, sigsz); else #endif ret = set_user_sigmask(sig, sigsz); if (ret) return ret; } init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; iowq.timeout = KTIME_MAX; if (uts) { struct timespec64 ts; if (get_timespec64(&ts, uts)) return -EFAULT; iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } trace_io_uring_cqring_wait(ctx, min_events); do { unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); } ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, 0); /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ io_run_task_work(); if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx); /* * Non-local task_work will be run on exit to userspace, but * if we're using DEFER_TASKRUN, then we could have waited * with a timeout for a number of requests. If the timeout * hits, we could have some requests ready to process. Ensure * this break is _after_ we have run task_work, to avoid * deferring running potentially pending requests until the * next time we wait for events. */ if (ret < 0) break; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { /* let the caller flush overflows, retry */ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) io_cqring_do_overflow_flush(ctx); if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { ret = -EBADR; break; } } if (io_should_wake(&iowq)) { ret = 0; break; } cond_resched(); } while (1); if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } void io_mem_free(void *ptr) { if (!ptr) return; folio_put(virt_to_folio(ptr)); } static void io_pages_free(struct page ***pages, int npages) { struct page **page_array; int i; if (!pages) return; page_array = *pages; if (!page_array) return; for (i = 0; i < npages; i++) unpin_user_page(page_array[i]); kvfree(page_array); *pages = NULL; } static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, unsigned long uaddr, size_t size) { struct page **page_array; unsigned int nr_pages; void *page_addr; int ret, i; *npages = 0; if (uaddr & (PAGE_SIZE - 1) || !size) return ERR_PTR(-EINVAL); nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (nr_pages > USHRT_MAX) return ERR_PTR(-EINVAL); page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!page_array) return ERR_PTR(-ENOMEM); ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, page_array); if (ret != nr_pages) { err: io_pages_free(&page_array, ret > 0 ? ret : 0); return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); } page_addr = page_address(page_array[0]); for (i = 0; i < nr_pages; i++) { ret = -EINVAL; /* * Can't support mapping user allocated ring memory on 32-bit * archs where it could potentially reside in highmem. Just * fail those with -EINVAL, just like we did on kernels that * didn't support this feature. */ if (PageHighMem(page_array[i])) goto err; /* * No support for discontig pages for now, should either be a * single normal page, or a huge page. Later on we can add * support for remapping discontig pages, for now we will * just fail them with EINVAL. */ if (page_address(page_array[i]) != page_addr) goto err; page_addr += PAGE_SIZE; } *pages = page_array; *npages = nr_pages; return page_to_virt(page_array[0]); } static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size_t size) { return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, size); } static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, size_t size) { return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, size); } static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); ctx->rings = NULL; ctx->sq_sqes = NULL; } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); ctx->n_sqe_pages = 0; } } void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; void *ret; ret = (void *) __get_free_pages(gfp, get_order(size)); if (ret) return ret; return ERR_PTR(-ENOMEM); } static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; if (ctx->flags & IORING_SETUP_CQE32) { if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) return SIZE_MAX; #endif if (ctx->flags & IORING_SETUP_NO_SQARRAY) { if (sq_offset) *sq_offset = SIZE_MAX; return off; } if (sq_offset) *sq_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; return off; } static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } if (nr) percpu_ref_put_many(&ctx->refs, nr); mutex_unlock(&ctx->uring_lock); } static void io_rsrc_node_cache_free(struct io_cache_entry *entry) { kfree(container_of(entry, struct io_rsrc_node, cache)); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) return; mutex_lock(&ctx->uring_lock); if (ctx->buf_data) __io_sqe_buffers_unregister(ctx); if (ctx->file_data) __io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); if (ctx->submitter_task) put_task_struct(ctx->submitter_task); /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) io_rsrc_node_destroy(ctx, ctx->rsrc_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); io_kbuf_mmap_list_free(ctx); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } static __cold void io_activate_pollwq_cb(struct callback_head *cb) { struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, poll_wq_task_work); mutex_lock(&ctx->uring_lock); ctx->poll_activated = true; mutex_unlock(&ctx->uring_lock); /* * Wake ups for some events between start of polling and activation * might've been lost due to loose synchronisation. */ wake_up_all(&ctx->poll_wq); percpu_ref_put(&ctx->refs); } __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ if (ctx->poll_activated || ctx->poll_wq_task_work.func) goto out; if (WARN_ON_ONCE(!ctx->task_complete)) goto out; if (!ctx->submitter_task) goto out; /* * with ->submitter_task only the submitter task completes requests, we * only need to sync with it, which is done by injecting a tw */ init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); percpu_ref_get(&ctx->refs); if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) percpu_ref_put(&ctx->refs); out: spin_unlock(&ctx->completion_lock); } static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; if (unlikely(!ctx->poll_activated)) io_activate_pollwq(ctx); poll_wait(file, &ctx->poll_wq, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring */ smp_rmb(); if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Don't flush cqring overflow list here, just do a simple check. * Otherwise there could possible be ABBA deadlock: * CPU0 CPU1 * ---- ---- * lock(&ctx->uring_lock); * lock(&ep->mtx); * lock(&ctx->uring_lock); * lock(&ep->mtx); * * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushes them to do the flush. */ if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } struct io_tctx_exit { struct callback_head task_work; struct completion completion; struct io_ring_ctx *ctx; }; static __cold void io_tctx_exit_cb(struct callback_head *cb) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_exit *work; work = container_of(cb, struct io_tctx_exit, task_work); /* * When @in_cancel, we're in cancellation and it's racy to remove the * node. It'll be removed by the end of cancellation, just ignore it. * tctx can be NULL if the queueing of this task_work raced with * work cancelation off the exec path. */ if (tctx && !atomic_read(&tctx->in_cancel)) io_uring_del_tctx_node((unsigned long)work->ctx); complete(&work->completion); } static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); return req->ctx == data; } static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while * we're waiting for refs to drop. We need to reap these manually, * as nobody else will be looking for them. */ do { if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { mutex_lock(&ctx->uring_lock); io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); while (io_uring_try_cancel_requests(ctx, NULL, true)) cond_resched(); if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; io_sq_thread_park(sqd); tsk = sqd->thread; if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); io_sq_thread_unpark(sqd); } io_req_caches_free(ctx); if (WARN_ON_ONCE(time_after(jiffies, timeout))) { /* there is little hope left, don't run it too often */ interval = HZ * 60; } /* * This is really an uninterruptible wait, as it has to be * complete. But it's also run from a kworker, which doesn't * take signals, so it's fine to make it interruptible. This * avoids scenarios where we knowingly can wait much longer * on completions, for example if someone does a SIGSTOP on * a task that needs to finish task_work to make this loop * complete. That's a synthetic situation that should not * cause a stuck task backtrace, and hence a potential panic * on stuck tasks if that is enabled. */ } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); exit.ctx = ctx; mutex_lock(&ctx->uring_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); /* don't spin on a single task if cancellation failed */ list_rotate_left(&ctx->tctx_list); ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); if (WARN_ON_ONCE(ret)) continue; mutex_unlock(&ctx->uring_lock); /* * See comment above for * wait_for_completion_interruptible_timeout() on why this * wait is marked as interruptible. */ wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); /* pairs with RCU read section in io_req_local_work_add() */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) synchronize_rcu(); io_ring_ctx_free(ctx); } static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; struct creds *creds; mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); if (ctx->rings) io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); /* * If we failed setting up the ctx, we might not have any rings * and therefore did not submit any requests */ if (ctx->rings) io_kill_timeouts(ctx, NULL, true); flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* * Use system_unbound_wq to avoid spawning tons of event kworkers * if we're exiting a ton of rings at the same time. It just adds * noise and overhead, there's no discernable change in runtime * over using system_wq. */ queue_work(system_unbound_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) { struct io_ring_ctx *ctx = file->private_data; file->private_data = NULL; io_ring_ctx_wait_and_kill(ctx); return 0; } struct io_task_cancel { struct task_struct *task; bool all; }; static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; return io_match_task_safe(req, cancel->task, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } return true; } static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) { struct io_tctx_node *node; enum io_wq_cancel cret; bool ret = false; mutex_lock(&ctx->uring_lock); list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; /* * io_wq will stay alive while we hold uring_lock, because it's * killed after ctx nodes, which requires to take the lock. */ if (!tctx || !tctx->io_wq) continue; cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } mutex_unlock(&ctx->uring_lock); return ret; } static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; bool ret = false; lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, hash_node) { struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct file *file = req->file; if (!cancel_all && req->task != task) continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { /* ->sqe isn't available if no async data */ if (!req_has_async_data(req)) cmd->sqe = NULL; file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); ret = true; } } io_submit_flush_completions(ctx); return ret; } static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; enum io_wq_cancel cret; bool ret = false; /* set it so io_req_local_work_add() would wake us up */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 1); smp_mb(); } /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) return false; if (!task) { ret |= io_uring_try_cancel_iowq(ctx); } else if (tctx && tctx->io_wq) { /* * Cancels requests of all rings, not only @ctx, but * it's fine as the task is in exit/exec. */ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || (ctx->sq_data && ctx->sq_data->thread == current)) { while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; cond_resched(); } } if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); ret |= io_waitid_remove_all(ctx, task, cancel_all); ret |= io_futex_remove_all(ctx, task, cancel_all); ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work() > 0; return ret; } static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) return atomic_read(&tctx->inflight_tracked); return percpu_counter_sum(&tctx->inflight); } /* * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. */ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; struct io_tctx_node *node; unsigned long index; s64 inflight; DEFINE_WAIT(wait); WARN_ON_ONCE(sqd && sqd->thread != current); if (!current->io_uring) return; if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); atomic_inc(&tctx->in_cancel); do { bool loop = false; io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) break; if (!sqd) { xa_for_each(&tctx->xa, index, node) { /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) continue; loop |= io_uring_try_cancel_requests(node->ctx, current, cancel_all); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, current, cancel_all); } if (loop) { cond_resched(); continue; } prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); io_run_task_work(); io_uring_drop_tctx_refs(current); xa_for_each(&tctx->xa, index, node) { if (!llist_empty(&node->ctx->work_llist)) { WARN_ON_ONCE(node->ctx->submitter_task && node->ctx->submitter_task != current); goto end_wait; } } /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did * prepare_to_wait(). */ if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); end_wait: finish_wait(&tctx->wait, &wait); } while (1); io_uring_clean_tctx(tctx); if (cancel_all) { /* * We shouldn't run task_works after cancel, so just leave * ->in_cancel set for normal exit. */ atomic_dec(&tctx->in_cancel); /* for exec all current's requests should be gone, kill tctx */ __io_uring_free(current); } } void __io_uring_cancel(bool cancel_all) { io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; struct page *page; void *ptr; switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); ptr = ctx->rings; break; case IORING_OFF_SQES: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); ptr = ctx->sq_sqes; break; case IORING_OFF_PBUF_RING: { unsigned int bgid; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; rcu_read_lock(); ptr = io_pbuf_get_address(ctx, bgid); rcu_read_unlock(); if (!ptr) return ERR_PTR(-EINVAL); break; } default: return ERR_PTR(-EINVAL); } page = virt_to_head_page(ptr); if (sz > page_size(page)) return ERR_PTR(-EINVAL); return ptr; } #ifdef CONFIG_MMU static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { size_t sz = vma->vm_end - vma->vm_start; unsigned long pfn; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; /* * Do not allow to map to user-provided address to avoid breaking the * aliasing rules. Userspace is not able to guess the offset address of * kernel kmalloc()ed memory area. */ if (addr) return -EINVAL; ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; /* * Some architectures have strong cache aliasing requirements. * For such architectures we need a coherent mapping which aliases * kernel memory *and* userspace memory. To achieve that: * - use a NULL file pointer to reference physical memory, and * - use the kernel virtual address of the shared io_uring context * (instead of the userspace-provided address, which has to be 0UL * anyway). * - use the same pgoff which the get_unmapped_area() uses to * calculate the page colouring. * For architectures without such aliasing requirements, the * architecture will return any suitable mapping because addr is 0. */ filp = NULL; flags |= MAP_SHARED; pgoff = 0; /* has been translated to ptr above */ #ifdef SHM_COLOUR addr = (uintptr_t) ptr; pgoff = addr >> PAGE_SHIFT; #else addr = 0UL; #endif return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; } static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) { return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; } static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); return (unsigned long) ptr; } #endif /* !CONFIG_MMU */ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) { if (flags & IORING_ENTER_EXT_ARG) { struct io_uring_getevents_arg arg; if (argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; } return 0; } static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, struct __kernel_timespec __user **ts, const sigset_t __user **sig) { struct io_uring_getevents_arg arg; /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { *sig = (const sigset_t __user *) argp; *ts = NULL; return 0; } /* * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ if (*argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (arg.pad) return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask); *argsz = arg.sigmask_sz; *ts = u64_to_user_ptr(arg.ts); return 0; } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const void __user *, argp, size_t, argsz) { struct io_ring_ctx *ctx; struct file *file; long ret; if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_REGISTERED_RING))) return -EINVAL; /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. */ if (flags & IORING_ENTER_REGISTERED_RING) { struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; if (unlikely(!file)) return -EBADF; } else { file = fget(fd); if (unlikely(!file)) return -EBADF; ret = -EOPNOTSUPP; if (unlikely(!io_is_uring_fops(file))) goto out; } ctx = file->private_data; ret = -EBADFD; if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; /* * For SQ polling, the thread will do all submissions and completions. * Just return the requested submit count, and wake the thread if * we were asked to. */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx); if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) io_sqpoll_wait_sq(ctx); ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit); if (ret != to_submit) { mutex_unlock(&ctx->uring_lock); goto out; } if (flags & IORING_ENTER_GETEVENTS) { if (ctx->syscall_iopoll) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and * it should handle ownership problems if any. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) (void)io_run_local_work_locked(ctx); } mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { int ret2; if (ctx->syscall_iopoll) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to * prevent racing with polled issue that got punted to * a workqueue. */ mutex_lock(&ctx->uring_lock); iopoll_locked: ret2 = io_validate_ext_arg(flags, argp, argsz); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); ret2 = io_iopoll_check(ctx, min_complete); } mutex_unlock(&ctx->uring_lock); } else { const sigset_t __user *sig; struct __kernel_timespec __user *ts; ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); ret2 = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } if (!ret) { ret = ret2; /* * EBADR indicates that one or more CQE were dropped. * Once the user has been informed we can clear the bit * as they are obviously ok with those drops. */ if (unlikely(ret2 == -EBADR)) clear_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); } } out: if (!(flags & IORING_ENTER_REGISTERED_RING)) fput(file); return ret; } static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, #ifndef CONFIG_MMU .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, #else .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = io_uring_show_fdinfo, #endif }; bool io_is_uring_fops(struct file *file) { return file->f_op == &io_uring_fops; } static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { struct io_rings *rings; size_t size, sq_array_offset; void *ptr; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) rings = io_mem_alloc(size); else rings = io_rings_map(ctx, p->cq_off.user_addr, size); if (IS_ERR(rings)) return PTR_ERR(rings); ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; if (p->flags & IORING_SETUP_SQE128) size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); else size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_rings_free(ctx); return -EOVERFLOW; } if (!(ctx->flags & IORING_SETUP_NO_MMAP)) ptr = io_mem_alloc(size); else ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); if (IS_ERR(ptr)) { io_rings_free(ctx); return PTR_ERR(ptr); } ctx->sq_sqes = ptr; return 0; } static int io_uring_install_fd(struct file *file) { int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; fd_install(fd, file); return fd; } /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this * fd to gain access to the SQ/CQ ring details. */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { /* Create a new inode so that the LSM can block the creation. */ return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); } static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; struct io_uring_task *tctx; struct file *file; int ret; if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { if (!(p->flags & IORING_SETUP_CLAMP)) return -EINVAL; entries = IORING_MAX_ENTRIES; } if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) && !(p->flags & IORING_SETUP_NO_MMAP)) return -EINVAL; /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, * since the sqes are only used at submission time. This allows for * some flexibility in overcommitting a bit. If the application has * set IORING_SETUP_CQSIZE, it will have passed in the desired number * of CQ ring entries manually. */ p->sq_entries = roundup_pow_of_two(entries); if (p->flags & IORING_SETUP_CQSIZE) { /* * If IORING_SETUP_CQSIZE is set, we do the same roundup * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ if (!p->cq_entries) return -EINVAL; if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { if (!(p->flags & IORING_SETUP_CLAMP)) return -EINVAL; p->cq_entries = IORING_MAX_CQ_ENTRIES; } p->cq_entries = roundup_pow_of_two(p->cq_entries); if (p->cq_entries < p->sq_entries) return -EINVAL; } else { p->cq_entries = 2 * p->sq_entries; } ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) ctx->lockless_cq = true; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ if (!ctx->task_complete) ctx->poll_activated = true; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events * polling again, they can rely on io_sq_thread to do polling * work, which can reduce cpu usage and uring_lock contention. */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->syscall_iopoll = 1; ctx->compat = in_compat_syscall(); if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ ret = -EINVAL; if (ctx->flags & IORING_SETUP_SQPOLL) { /* IPI related flags don't make sense with SQPOLL */ if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_DEFER_TASKRUN)) goto err; ctx->notify_method = TWA_SIGNAL_NO_IPI; } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { ctx->notify_method = TWA_SIGNAL_NO_IPI; } else { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) goto err; ctx->notify_method = TWA_SIGNAL; } /* * For DEFER_TASKRUN we require the completion task to be the same as the * submission task. This implies that there is only one submitter, so enforce * that. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { goto err; } /* * This is just grabbed for accounting purposes. When a process exits, * the mm is exited and dropped before the files, hence we need to hang * on to this mm purely for the purposes of being able to unaccount * memory (locked/pinned vm). It's not used for anything else. */ mmgrab(current->mm); ctx->mm_account = current->mm; ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; ret = io_sq_offload_create(ctx, p); if (ret) goto err; ret = io_rsrc_init(ctx); if (ret) goto err; p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; p->sq_off.resv1 = 0; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) p->sq_off.user_addr = 0; p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->cq_off.resv1 = 0; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) p->cq_off.user_addr = 0; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; goto err; } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !(ctx->flags & IORING_SETUP_R_DISABLED)) WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); file = io_uring_get_file(ctx); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err; } ret = __io_uring_add_tctx_node(ctx); if (ret) goto err_fput; tctx = current->io_uring; /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); else ret = io_uring_install_fd(file); if (ret < 0) goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; err_fput: fput(file); return ret; } /* * Sets up an aio uring context, and returns the fd. Applications asks for a * ring size, we return the actual sq/cq ring sizes (among other things) in the * params structure passed in. */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; int i; if (copy_from_user(&p, params, sizeof(p))) return -EFAULT; for (i = 0; i < ARRAY_SIZE(p.resv); i++) { if (p.resv[i]) return -EINVAL; } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | IORING_SETUP_NO_SQARRAY)) return -EINVAL; return io_uring_create(entries, &p, params); } static inline bool io_uring_allowed(void) { int disabled = READ_ONCE(sysctl_io_uring_disabled); kgid_t io_uring_group; if (disabled == 2) return false; if (disabled == 0 || capable(CAP_SYS_ADMIN)) return true; io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); if (!gid_valid(io_uring_group)) return false; return in_group_p(io_uring_group); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, struct io_uring_params __user *, params) { if (!io_uring_allowed()) return -EPERM; return io_uring_setup(entries, params); } static int __init io_uring_init(void) { #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ } while (0) #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename) #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \ __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename) BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); BUILD_BUG_SQE_ELEM(0, __u8, opcode); BUILD_BUG_SQE_ELEM(1, __u8, flags); BUILD_BUG_SQE_ELEM(2, __u16, ioprio); BUILD_BUG_SQE_ELEM(4, __s32, fd); BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(8, __u32, cmd_op); BUILD_BUG_SQE_ELEM(12, __u32, __pad1); BUILD_BUG_SQE_ELEM(16, __u64, addr); BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(28, __u32, rename_flags); BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags); BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags); BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_SQE_ELEM(44, __u16, addr_len); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); /* ->buf_index is u16 */ BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); /* top 8bits are for internal use */ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); io_uring_optable_init(); /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling * requires copying in user memory into the io_kiocb object in that * range, and HARDENED_USERCOPY will complain if we haven't * correctly annotated this range. */ req_cachep = kmem_cache_create_usercopy("io_kiocb", sizeof(struct io_kiocb), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, offsetof(struct io_kiocb, cmd.data), sizeof_field(struct io_kiocb, cmd.data), NULL); io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); #endif return 0; }; __initcall(io_uring_init);
4349 4349 4349 45 45 41 371 59 24 357 51 307 51 174 12 10 268 272 259 55 204 7 181 35 163 5 158 2 156 8 147 10 138 13 124 118 19 106 2 68 3 67 4 63 1 62 2 60 4 56 2 54 4 50 48 4 46 46 10 36 38 26 72 353 353 2 351 304 347 93 351 2 111 337 335 210 211 211 31 27 211 186 186 59 59 59 5 5 4 4 122 122 122 6 12 13 10 30 27 22 23 138 27 27 27 4 22 30 25 2 271 168 10 1 2 2 2 6 1 6 16 15 16 15 14 8 7 12 12 6 14 13 13 212 212 15 468 455 15 2613 2613 3 2 2 2609 2614 1 7 5 8 5 11 11 10 9 10 7 5 464 465 10 4 457 455 426 454 454 19 454 453 447 277 413 414 414 414 61 413 293 292 273 89 15 15 15 5 211 211 436 86 326 827 2 14 806 291 2 2 2 4 2 2 2 2 41 20 210 27 61 49 59 60 56 52 2 53 50 2 47 50 51 213 214 211 201 198 2 197 195 193 191 189 190 57 57 55 53 4 47 4 43 49 69 65 61 60 5 55 59 40 59 58 18 17 2 15 18 14 13 12 12 8 13 95 94 2 91 92 29 89 90 90 90 90 85 25 30 24 29 1 26 28 15 25 25 25 24 22 24 23 1 1 22 22 22 21 21 8 8 10 28 27 25 24 23 3 2 2 20 18 17 9 8 6 15 17 18 12 7 7 6 4 4 6 2404 2410 2396 2397 1698 1680 1680 1679 30 690 721 721 31 1 30 30 30 788 30 760 18 3 35 35 1 2 1681 1257 1307 1231 1228 27 27 1229 86 1231 1 80 54 2463 53 44 27 24 6 4 4 86 89 2455 52 43 67 34 28 26 5 39 2333 2330 2493 2494 2490 2492 2481 1014 2480 29 1 1 2458 1 1 92 10 11 2454 2454 2454 2439 2423 2422 19 2414 4 10 2407 3 21 2397 2326 1677 3 2322 2454 76 76 2420 14 13 4 12 17 16 15 48 22 22 44 46 24 46 3 16 46 45 45 402 404 402 402 402 404 1 2 36 18 20 23 6 1 5 5 4 7 6 7 2 1 358 357 356 356 355 1 354 358 4 379 358 93 111 6 8 6 2 105 3 2 5 64 3 28 60 56 15 64 36 35 48 44 43 13 6 10 10 14 14 43 30 7 7 22 20 20 8 5 6 7 5 530 25 24 23 7 2 7 6 696 658 694 677 693 691 687 27 26 23 22 27 9 3 2 7 6 10 9 8 7 2 3 3 3 3 2 5 7 7 7 1 7 1 7 7 5 3 28 36 35 34 40 41 40 39 4 2 4 4 37 1 33 33 7 7 31 1 31 8 8 23 31 5 3 1 2 1 1 2 29 1 6 6 3 3 3 1 29 3 7 6 23 2 4 1 28 7 28 28 2 1 28 1 5 3 25 2 4 4 4 3 3 3 1 27 3 4 2 2 2 29 9 1 9 2 9 8 8 11 10 9 10 10 21 4 90 78 40 38 10 28 9 19 11 288 11 11 6 6 6 4 4 1 3 2 4 4 6 4 13 12 12 12 9 9 7 6 6 1 7 156 92 154 149 55 149 147 111 12 108 96 29 28 77 3 75 69 61 10 1 30 15 14 2 3 2 2 1 1 1 68 12 11 9 8 4 6 6 1 7 8 7 6 4 8 4 4 3 5 4 2 14 14 13 14 15 15 8 7 1 13 14 9 6 2 3 1 5 5 1 5 4353 4169 4351 4350 4354 463 60 215 2494 14 3 9 88 290 45 15 97 19 80 1 4355 2 2 2 28 1 24 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> #include <linux/bsearch.h> #include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> #include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/license.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> #include <linux/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> #include <linux/sort.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> #include <net/tcx.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ IS_FD_HASH(map)) #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); static DEFINE_IDR(link_idr); static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly = IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE }; /* * If we're handed a bigger struct than we know of, ensure all the unknown bits * are 0 - i.e. new user-space does not rely on any kernel feature extensions * we don't know about yet. * * There is a ToCToU between this function call and the following * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size) { int res; if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; if (actual_size <= expected_size) return 0; if (uaddr.is_kernel) res = memchr_inv(uaddr.kernel + expected_size, 0, actual_size - expected_size) == NULL; else res = check_zeroed_user(uaddr.user + expected_size, actual_size - expected_size); if (res < 0) return res; return res ? 0 : -E2BIG; } const struct bpf_map_ops bpf_map_offload_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, .map_mem_usage = bpf_map_offload_map_mem_usage, }; static void bpf_map_write_active_inc(struct bpf_map *map) { atomic64_inc(&map->writecnt); } static void bpf_map_write_active_dec(struct bpf_map *map) { atomic64_dec(&map->writecnt); } bool bpf_map_write_active(const struct bpf_map *map) { return atomic64_read(&map->writecnt) != 0; } static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); else return map->value_size; } static void maybe_wait_bpf_programs(struct bpf_map *map) { /* Wait for any running non-sleepable BPF programs to complete so that * userspace, when we return to it, knows that all non-sleepable * programs that could be running use the new map value. For sleepable * BPF programs, synchronize_rcu_tasks_trace() should be used to wait * for the completions of these programs, but considering the waiting * time can be very long and userspace may think it will hang forever, * so don't handle sleepable BPF programs now. */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) synchronize_rcu(); } static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { int err; /* Need to create a kthread, thus must support schedule */ if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || map->map_type == BPF_MAP_TYPE_SOCKMAP) { return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { return bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_update(map, key, value, flags); } else if (IS_FD_ARRAY(map)) { err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, flags); rcu_read_unlock(); } bpf_enable_instrumentation(); return err; } static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, __u64 flags) { void *ptr; int err; if (bpf_map_is_offloaded(map)) return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_peek_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* struct_ops map requires directly updating "value" */ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) ptr = map->ops->map_lookup_elem_sys_only(map, key); else ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { err = -ENOENT; } else { err = 0; if (flags & BPF_F_LOCK) /* lock 'ptr' and copy everything but lock */ copy_map_value_locked(map, value, ptr, true); else copy_map_value(map, value, ptr); /* mask lock and timer, since value wasn't zero inited */ check_and_init_map_value(map, value); } rcu_read_unlock(); } bpf_enable_instrumentation(); return err; } /* Please, do not use this function outside from the map creation path * (e.g. in map update path) without taking care of setting the active * memory cgroup (see at bpf_map_kmalloc_node() for example). */ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, * which is used for lower order allocation requests. * * It has been observed that higher order allocation requests done by * vmalloc with __GFP_NORETRY being set might fail due to not trying * to reclaim memory from the page cache, thus we set * __GFP_RETRY_MAYFAIL to avoid such situations. */ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); unsigned int flags = 0; unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ if (mmapable) { BUG_ON(!PAGE_ALIGNED(size)); align = SHMLBA; flags = VM_USERMAP; } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, false); } void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, true); } void bpf_map_area_free(void *area) { kvfree(area); } static u32 bpf_map_flags_retain_permanent(u32 flags) { /* Some map creation flags are not tied to the map object but * rather to the map fd instead, so they have no meaning upon * map object inspection since multiple file descriptors with * different (access) properties can exist here. Thus, given * this has zero meaning for the map itself, lets clear these * from here. */ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); } void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); map->map_extra = attr->map_extra; } static int bpf_map_alloc_id(struct bpf_map *map) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } void bpf_map_free_id(struct bpf_map *map) { unsigned long flags; /* Offloaded maps are removed from the IDR store when their device * disappears - even if someone holds an fd to them they are unusable, * the memory is gone, all ops will fail; they are simply waiting for * refcnt to drop to be freed. */ if (!map->id) return; spin_lock_irqsave(&map_idr_lock, flags); idr_remove(&map_idr, map->id); map->id = 0; spin_unlock_irqrestore(&map_idr_lock, flags); } #ifdef CONFIG_MEMCG_KMEM static void bpf_map_save_memcg(struct bpf_map *map) { /* Currently if a map is created by a process belonging to the root * memory cgroup, get_obj_cgroup_from_current() will return NULL. * So we have to check map->objcg for being NULL each time it's * being used. */ if (memcg_bpf_enabled()) map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) { if (map->objcg) obj_cgroup_put(map->objcg); } static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) { if (map->objcg) return get_mem_cgroup_from_objcg(map->objcg); return root_mem_cgroup; } void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } #else static void bpf_map_save_memcg(struct bpf_map *map) { } static void bpf_map_release_memcg(struct bpf_map *map) { } #endif static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; if (f1->offset < f2->offset) return -1; else if (f1->offset > f2->offset) return 1; return 0; } struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask) { struct btf_field *field; if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) return NULL; field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); if (!field || !(field->type & field_mask)) return NULL; return field; } void btf_record_free(struct btf_record *rec) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: /* Nothing to release */ break; default: WARN_ON_ONCE(1); continue; } } kfree(rec); } void bpf_map_free_record(struct bpf_map *map) { btf_record_free(map->record); map->record = NULL; } struct btf_record *btf_record_dup(const struct btf_record *rec) { const struct btf_field *fields; struct btf_record *new_rec; int ret, size, i; if (IS_ERR_OR_NULL(rec)) return NULL; size = offsetof(struct btf_record, fields[rec->cnt]); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); /* Do a deep copy of the btf_record */ fields = rec->fields; new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; goto free; } break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: /* Nothing to acquire */ break; default: ret = -EFAULT; WARN_ON_ONCE(1); goto free; } new_rec->cnt++; } return new_rec; free: btf_record_free(new_rec); return ERR_PTR(ret); } bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; if (!a_has_fields && !b_has_fields) return true; if (a_has_fields != b_has_fields) return false; if (rec_a->cnt != rec_b->cnt) return false; size = offsetof(struct btf_record, fields[rec_a->cnt]); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. * * While spin_lock, timer, and kptr have no relation to map BTF, * list_head metadata is specific to map BTF, the btf and value_rec * members in particular. btf is the map BTF, while value_rec points to * btf_record in that map BTF. * * So while by default, we don't rely on the map BTF (which the records * were parsed from) matching for both records, which is not backwards * compatible, in case list_head is part of it, we implicitly rely on * that by way of depending on memcmp succeeding for it. */ return !memcmp(rec_a, rec_b, size); } void bpf_obj_free_timer(const struct btf_record *rec, void *obj) { if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) return; bpf_timer_cancel_and_free(obj + rec->timer_off); } void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; int i; if (IS_ERR_OR_NULL(rec)) return; fields = rec->fields; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *pointee_struct_meta; const struct btf_field *field = &fields[i]; void *field_ptr = obj + field->offset; void *xchgd_field; switch (fields[i].type) { case BPF_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: case BPF_KPTR_PERCPU: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); if (!xchgd_field) break; if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? pointee_struct_meta->record : NULL, fields[i].type == BPF_KPTR_PERCPU); migrate_enable(); } else { field->kptr.dtor(xchgd_field); } break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_RB_ROOT: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: break; default: WARN_ON_ONCE(1); continue; } } } /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; struct btf *btf = map->btf; security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. * * Note that the btf_record stashed in map->inner_map_meta->record was * already freed using the map_free callback for map in map case which * eventually calls bpf_map_free_meta, since inner_map_meta is only a * template bpf_map struct used during verification. */ btf_record_free(rec); /* Delay freeing of btf for maps, as map_free callback may need * struct_meta info which will be freed with btf_put(). */ btf_put(btf); } static void bpf_map_put_uref(struct bpf_map *map) { if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } } static void bpf_map_free_in_work(struct bpf_map *map) { INIT_WORK(&map->work, bpf_map_free_deferred); /* Avoid spawning kworkers, since they all might contend * for the same mutex like slab_mutex. */ queue_work(system_unbound_wq, &map->work); } static void bpf_map_free_rcu_gp(struct rcu_head *rcu) { bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); } static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_map_free_rcu_gp(rcu); else call_rcu(rcu, bpf_map_free_rcu_gp); } /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ void bpf_map_put(struct bpf_map *map) { if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); if (READ_ONCE(map->free_after_mult_rcu_gp)) call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); else if (READ_ONCE(map->free_after_rcu_gp)) call_rcu(&map->rcu, bpf_map_free_rcu_gp); else bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); bpf_map_put(map); } static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; if (map->ops->map_release) map->ops->map_release(map, filp); bpf_map_put_with_uref(map); return 0; } static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) { fmode_t mode = f.file->f_mode; /* Our file permissions may have been overridden by global * map permissions facing syscall side. */ if (READ_ONCE(map->frozen)) mode &= ~FMODE_CAN_WRITE; return mode; } #ifdef CONFIG_PROC_FS /* Show the memory usage of a bpf map */ static u64 bpf_map_memory_usage(const struct bpf_map *map) { return map->ops->map_mem_usage(map); } static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; if (map_type_contains_progs(map)) { spin_lock(&map->owner.lock); type = map->owner.type; jited = map->owner.jited; spin_unlock(&map->owner.lock); } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" "map_extra:\t%#llx\n" "memlock:\t%llu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, (unsigned long long)map->map_extra, bpf_map_memory_usage(map), map->id, READ_ONCE(map->frozen)); if (type) { seq_printf(m, "owner_prog_type:\t%u\n", type); seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_READ. */ return -EINVAL; } static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_WRITE. */ return -EINVAL; } /* called for any extra memory-mapped regions (except initial) */ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { .open = bpf_map_mmap_open, .close = bpf_map_mmap_close, }; static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; int err; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(